1 OVERVIEW OF THIS DOCUMENT



This document shows my R codes that I used to prepare and analyze Hooli’s employee survey data. Please use the table of content on the left side of this document to navigate the document. If you click “Show” button in each section, you can see my codes. The document is structured as follows:

    1. Overview of this document
    1. Data preparation & processing for analysis
    1. Data Analysis
    • 3.1 Calculate the favorable scores for all questions and factors for Hooli overall
    • 3.2 Calculate engagement factor (favorable) scores for all tenure groups
    • 3.3 Calculate favorable scores for all questions and factors by gender
    • 3.4 Correlate all on-engagement questions to the engagement factor for Hooli overall
    • 3.5 Calculate engagement & leadership factor (favorable) scores for all countries
    • 3.6 Demographic Composition of the Company



2 DATA PREPARATION & PROCESSING FOR ANALYSIS



2.1 Loading Data

#check and install required packages to successfully run this rmd

required_packages <- c(
  "tidyselect", "xfun", "corrplot", "pander", "reshape2", "purrr", "tcltk", "colorspace",
  "vctrs", "summarytools", "generics", "htmltools", "yaml", "grDevices", "base64enc", "utf8",
  "rlang", "pillar", "foreign", "glue", "withr", "pryr", "readxl", "matrixStats",
  "lifecycle", "plyr", "NCmisc", "stringr", "munsell", "gtable", "cellranger", "codetools",
  "evaluate", "labeling", "knitr", "fastmap", "fansi", "methods", "Rcpp", "backports",
  "scales", "checkmate", "magick", "farver", "rapportools", "ggplot2", "stats", "datasets",
  "graphics", "digest", "stringi", "dplyr", "grid", "cli", "tools", "magrittr",
  "tibble", "Amelia", "tidyr", "pkgconfig", "MASS", "utils", "timechange", "lubridate",
  "rmarkdown", "base", "rstudioapi", "R6", "compiler"
)

for (pkg in required_packages) {
  if (!(pkg %in% installed.packages())) {
    install.packages(pkg, dependencies = TRUE)
  }
}

#load the data

library(readxl)
df_original <- read_xlsx("data/sr._people_science_analyst_assignment_dataset_2024.xlsx")



2.2 Data Exploration & Preparation



2.2.1 Examining the macro features of the data


I first check the macro features of this df to make sure that everything is correct.


2.2.1.1 transforming ‘lea_3’ into a numeric variable


‘lea_3’ should be a numeric variable. After checking the data, employee ID M01562’s response is only missing for’lea_3’. Therefore, this ‘N/A’ should be recoded as -1.


2.2.1.2 transforming’hiredate’ into a date variable


‘hiredate’ should be a date variable. It is also important to note that each country follows a different date format (it’s a good practice to check the data in Excel before loading in into R to prevent such a mistake). So, in converting this variable to date-type, I need to specify this format in my code.

  - Australia: Day/Month/Year
  - Denmark: Day/Month/Year
  - France: Day/Month/Year
  - Germany: Day/Month/Year
  - India: Day/Month/Year
  - UK: Day/Month/Year
  - United Kingdom: Day/Month/Year
  - United States: Month/Day/Year
  - USA: Month/Day/Year
  - Canada: Month/Day/Year
  - China: Year/Month/Day



# checking the macro-level trends of the dataset 

head(df_original, 5)
tail(df_original, 5)
str(df_original)
## tibble [2,651 × 27] (S3: tbl_df/tbl/data.frame)
##  $ eeid          : chr [1:2651] "M01434" "M00631" "M00325" "M00805" ...
##  $ ali_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ ali_2         : num [1:2651] 5 4 5 5 5 4 3 5 5 5 ...
##  $ ali_3         : num [1:2651] 4 4 4 5 4 3 4 4 4 5 ...
##  $ col_1         : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
##  $ col_2         : num [1:2651] 3 4 4 5 5 4 3 5 4 4 ...
##  $ col_3         : num [1:2651] 1 4 5 5 5 4 3 5 4 5 ...
##  $ eng_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ eng_2         : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
##  $ eng_3         : num [1:2651] 5 4 5 5 5 3 4 5 5 4 ...
##  $ eng_4         : num [1:2651] 5 3 4 5 4 4 3 4 5 5 ...
##  $ eng_5         : num [1:2651] 5 3 3 5 3 4 3 5 5 3 ...
##  $ inc_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ inc_2         : num [1:2651] 5 4 5 5 5 3 4 4 4 4 ...
##  $ inc_3         : num [1:2651] 5 4 5 5 5 2 4 4 5 5 ...
##  $ inc_4         : num [1:2651] 5 4 5 4 4 4 4 5 5 4 ...
##  $ inc_5         : num [1:2651] 3 3 5 3 5 4 4 5 5 5 ...
##  $ lea_1         : num [1:2651] 4 4 5 5 5 2 5 5 5 3 ...
##  $ lea_2         : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
##  $ lea_3         : chr [1:2651] "5" "5" "5" "5" ...
##  $ lea_4         : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
##  $ age           : chr [1:2651] "35-44" "45-54" "18-24" "45-54" ...
##  $ hiredate      : chr [1:2651] "25/02/2024" "02/10/2022" "20/10/2023" "08/12/2018" ...
##  $ race          : chr [1:2651] NA NA NA "White" ...
##  $ gender        : chr [1:2651] "Male" "Male" "Male" "Male" ...
##  $ manager_status: chr [1:2651] "Non-Manager" "Non-Manager" "Non-Manager" "Manager" ...
##  $ country       : chr [1:2651] "France" "France" "United Kingdom" "United States" ...
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(df_original, main = 'missing data map for the Hooli Survey Data', col = c('purple', 'black'), legend = TRUE)

# except for race, most values are not missing!

# Creating a new df that for data preparation & processing for analysis while preserving the original df

df <- df_original 

# Fixing errors in 'lea_3'

table(df_original$lea_3)
## 
##   -1    1    2    3    4    5   99  N/A 
##   12   22   58  197 1147 1213    1    1
df[df$lea_3 == "N/A", ]
df$lea_3[df$lea_3 == "N/A"] <- -1
table(df$lea_3)
## 
##   -1    1    2    3    4    5   99 
##   13   22   58  197 1147 1213    1
#transforming 'lea_3' to a numeric variable
df$lea_3 <- as.numeric(df$lea_3)

#transforming hiredate into a date variable using different date formats for different countries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- df %>%
  mutate(hiredate = case_when(
    country %in% c("Australia", "Denmark", "France", "Germany", "India", "UK", "United Kingdom") ~ as.Date(hiredate, format = "%d/%m/%Y"),
    country %in% c("United States", "USA", "Canada") ~ as.Date(hiredate, format = "%m/%d/%Y"),
    country == "China" ~ as.Date(hiredate, format = "%Y/%m/%d"),
    TRUE ~ as.Date(NA)  
  ))

library(summarytools)
dfSummary(df$hiredate)
## df$hiredate was converted to a data frame



2.2.1.3 Handling outliers, ‘99’, in various variables


I identified several outliers, value ‘99’, from the following variables: ali_1, ali_2, ali_3, col_1, col_2, col_3, and lea_4. The value ‘99’ is a value that is out of the scale of -1 to 5. I discovered that all these responses come from Denmark. Thus, it was probably a systematic coding error for these responses from Denmark. Given that these are psychological measures, it doesn’t make sense code 99 might denote an exceptional response out of -1 to 5 scale. Thus, I deem 99 as a coding error. Additionally, these employees are less than 1% of the participants in the data and that they completed all the other questions in the survey. Therefore, I recode 99 to -1, the code for missing value.


library(summarytools)
dfSummary(df)
# examine the responses of the employees who responded '99' to one of these questions.
df[df$ali_1 == 99, ] # other responses are in the normal scale range & 2 participants = M00238 & M00568
df[df$ali_2 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
df[df$ali_3 == 99, ] # other responses are in the normal scale range & 1 participant = M02319
df[df$col_1 == 99, ] # other responses are in the normal scale range & 2 participants = M01339, M01843
df[df$col_2 == 99, ] # other responses are in the normal scale range & 1 participant = M00168
df[df$col_3 == 99, ] # other responses are in the normal scale range & 1 participant = M01393
df[df$lea_3 == 99, ] # except fo lea_3, all the other responses are in the normal scale range & 1 participant = M01484
df[df$lea_4 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
# recode values with 99 to -1.

df$ali_1[df$ali_1 == 99] <- -1
df$ali_2[df$ali_2 == 99] <- -1
df$ali_3[df$ali_3 == 99] <- -1
df$col_1[df$col_1 == 99] <- -1
df$col_2[df$col_2 == 99] <- -1
df$col_3[df$col_3 == 99] <- -1
df$lea_3[df$lea_3 == 99] <- -1
df$lea_4[df$lea_4 == 99] <- -1


2.2.1.4 Handling missing values


col_1, eng_2, inc_2, & inc_5 have 1 missing value each. It should be correctly coded as -1, the correct code for missing values.


df[is.na(df$col_1), ] #for employee_ID M02429, only col_1 is missing. It should be coded as -1. 
df[is.na(df$eng_2), ] #for employee_ID M01597, only eng_2 is missing. It should be coded as -1.
df[is.na(df$inc_2), ] #for employee_ID M01481, only inc_2 is missing. It should be coded as -1.
df[is.na(df$inc_5), ] #for employee_ID M01722, only inc_5 is missing. It should be coded as -1. 
df$col_1[is.na(df$col_1)] <- -1
df$eng_2[is.na(df$eng_2)] <- -1
df$inc_2[is.na(df$inc_2)] <- -1
df$inc_5[is.na(df$inc_5)] <- -1


2.2.1.5 Checking for duplicates


there are no duplicates based on employee ID. each response comes from a unique employee.

sum(duplicated(df$eeid))
## [1] 0


2.2.1.6 fixing groups in ‘country’ variable


For the ‘country’ variable, ‘UK’ and ‘United Kingdom’ should be merged into one category. Similarly, ‘USA’ and ‘United States’ should be merged into one category. Let’s combine these categories for each country.

table(df$country)
## 
##      Australia         Canada          China        Denmark         France 
##            101             84             60             24            136 
##        Germany          India             UK United Kingdom  United States 
##             48            288             11            348           1520 
##            USA 
##             31
df$country[df$country == "UK"] <- "United Kingdom"
df$country[df$country == "USA"] <- "United States"
table(df$country)
## 
##      Australia         Canada          China        Denmark         France 
##            101             84             60             24            136 
##        Germany          India United Kingdom  United States 
##             48            288            359           1551


2.2.1.7 consolidating ‘age’ variable’s categories


* the group ‘19’, ‘21’ should be consolidated into ‘18-24’. * the group ‘26’, ‘28’ should be consolidated into ‘25-34’. * the group ‘39’, ‘42’ should be consolidated into ‘35-44’. * the group ‘48’ should be consolidated into ‘45-54’.


table(df$age, useNA = "always")
## 
## 18-24    19    21 25-34    26    28 35-44    39    42 45-54    48 55-64   65+ 
##   147     1     1  1266     2     1   795     1     1   301     1    78    54 
##   N/A  <NA> 
##     2     0
df$age[df$age == "19"] <- "18-24"
df$age[df$age == "21"] <- "18-24"
df$age[df$age == "26"] <- "25-34"
df$age[df$age == "28"] <- "25-34"
df$age[df$age == "39"] <- "35-44"
df$age[df$age == "42"] <- "35-44"
df$age[df$age == "48"] <- "45-54"


2.2.1.8 correctly consolidating ‘race’ variable’s categories


For ‘race’ variable, “Black or African American” & “Black or African Americans” should be merged into one group.

table(df$race, useNA = "always")
## 
##            American Indian/Alaskan Native 
##                                         2 
##                                     Asian 
##                                       359 
##                 Black or African American 
##                                        37 
##                Black or African Americans 
##                                        11 
##                        Hispanic or Latino 
##                                        80 
## Native Hawaiian or Other Pacific Islander 
##                                         2 
##                         Two or More Races 
##                                        43 
##                                     White 
##                                      1050 
##                                      <NA> 
##                                      1067
df$race[df$race == "Black or African Americans"] <- "Black or African American"


2.2.1.9 correctly consolidating ‘gender’ variable’s categories


For ‘gender’ variable, these groups below need to be consolidated: * Consolidate “Male” & “Man” into one group * Consolidate “Female” & “Woman” into one group

table(df$gender, useNA = "always")
## 
##  Female    Male     Man Unknown   Woman    <NA> 
##     816    1826       4       1       4       0
df$gender[df$gender == "Man"] <- "Male"
df$gender[df$gender == "Woman"] <- "Female"


2.2.1.10 examining race variable’s missing observations


Based on the contingency table below, 40.2% of race’s variable’s observations are missing:

  • 101/101 employees in Australia have no information about their race.
  • 60/84 employees in Canada have no information about their race except 24 who are white.
  • 55/60 employees in China have no information about their race except 5 who are white.
  • 24/24 employees in Denmark have no information about their race.
  • 136/136 employees in France have no information about their race.
  • 48/48 employees in Germany have no information about their race.
  • 288/288 employees in India have no information about their race.
  • 355/359 employees in the United Kingdom have no information about their race.
  • 1017/1017 employees in the United States have demographic information about race.



Except for the United States, all the countries with missing information about race are largely ethnically homogeneous (e.g., Denmark). Therefore, the company might not collect information about employee’s race as the variance in race among employees might be too small to be a meaningful (For example, according to demographic research, about 90.2 percent of the population in Australia are white). Therefore, conducting an analysis about race for other countries except for the United States would not provide meaningful insights to Hooli.


c_t <- table(df$race, df$country, useNA = "always")
c_t
##                                            
##                                             Australia Canada China Denmark
##   American Indian/Alaskan Native                    0      0     0       0
##   Asian                                             0      0     0       0
##   Black or African American                         0      0     0       0
##   Hispanic or Latino                                0      0     0       0
##   Native Hawaiian or Other Pacific Islander         0      0     0       0
##   Two or More Races                                 0      0     0       0
##   White                                             0     24     5       0
##   <NA>                                            101     60    55      24
##                                            
##                                             France Germany India United Kingdom
##   American Indian/Alaskan Native                 0       0     0              0
##   Asian                                          0       0     0              0
##   Black or African American                      0       0     0              0
##   Hispanic or Latino                             0       0     0              0
##   Native Hawaiian or Other Pacific Islander      0       0     0              0
##   Two or More Races                              0       0     0              0
##   White                                          0       0     0              4
##   <NA>                                         136      48   288            355
##                                            
##                                             United States <NA>
##   American Indian/Alaskan Native                        2    0
##   Asian                                               359    0
##   Black or African American                            48    0
##   Hispanic or Latino                                   80    0
##   Native Hawaiian or Other Pacific Islander             2    0
##   Two or More Races                                    43    0
##   White                                              1017    0
##   <NA>                                                  0    0


2.2.1.11 Examining ‘hiredate’ variable.


None of the observations are missing.

df_2 <- subset(df, select = c(hiredate))
library(Amelia)
missmap(df_2)
rm(df_2)

#the earliest & latest date for hiredate
min(df$hiredate, na.rm = TRUE) # 2014-12-24
## [1] "2004-12-24"
max(df$hiredate, na.rm = TRUE) # 2024-02-29
## [1] "2024-02-29"
#let's create a new variable named hireyear 
df$hireyear <- format(df$hiredate, "%Y")
table(df$hireyear, useNA = "always")
## 
## 2004 2005 2006 2010 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 <NA> 
##    1    1    1    1   41   40   31   42  125  129  307  308  632  954   38    0
library(ggplot2)

ggplot(df, aes(x=hireyear)) + geom_bar()

ggplot(df, aes(x=hiredate)) + geom_histogram() #this company hired employees a ton in 2022!!
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


2.2.1.12 creating a new variable called ‘tenure’ by using hiredate


Substract ‘hiredate’ from March 1st 2024, the date instructed by the technical assessment rubric to create different tenure groups by using ‘hiredate’.Given that there are only 4 employees in ‘10+ years’ groups, we do not report any findings about this group as the reporting minimum is 5 per group.


#let's create the cutpoint, which is 2024-03-01, to create tenure groups
cutpoint <- as.Date("2024-03-01")
#given that the average number of days in a month is 365/12 = 30.4166666667, 
tenure_days <- as.numeric(difftime(cutpoint, df$hiredate, units = "days"))
tenure_months <- as.numeric(difftime(cutpoint, df$hiredate, units = "days")/(365/12))

#create tenure_groups 
tenure_groups <- cut(tenure_months,
                      breaks = c(0, 3, 6, 12, 24, 48, 72, 120, Inf),  # In months
                      labels = c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years", "10+ years"), right = FALSE) # right = FALSE to EXCLUDE the right side month in the cateogry

#the distribution of employees in each tenure group
table(tenure_groups, useNA = "always")
## tenure_groups
## Under 3 months     3-6 months    6-12 months      1-2 years      2-4 years 
##             64            325            507            670            644 
##      4-6 years     6-10 years      10+ years           <NA> 
##            274            163              4              0
# Add tenure_group variable to data frame
df$tenure_group <- as.character(tenure_groups)

# Print the data frame with the new tenure_group variable
table(df$tenure_group, useNA = "always")
## 
##      1-2 years      10+ years      2-4 years     3-6 months      4-6 years 
##            670              4            644            325            274 
##     6-10 years    6-12 months Under 3 months           <NA> 
##            163            507             64              0


2.2.1.13 recoding -1 (missing values) to NA


I changed all the missing values (-1) to NA (I named this new dataset df_2) to make sure that ‘-1’ does not influence the results of our analysis.


df_2 <- df

df_2$ali_1[df_2$ali_1 == -1] <- NA 
df_2$ali_2[df_2$ali_2 == -1] <- NA
df_2$ali_3[df_2$ali_3 == -1] <- NA

df_2$col_1[df_2$col_1 == -1] <- NA 
df_2$col_2[df_2$col_2 == -1] <- NA 
df_2$col_3[df_2$col_3 == -1] <- NA 

df_2$eng_1[df_2$eng_1 == -1] <- NA 
df_2$eng_2[df_2$eng_2 == -1] <- NA
df_2$eng_3[df_2$eng_3 == -1] <- NA 
df_2$eng_4[df_2$eng_4 == -1] <- NA
df_2$eng_5[df_2$eng_5 == -1] <- NA

df_2$inc_1[df_2$inc_1 == -1] <- NA 
df_2$inc_2[df_2$inc_2 == -1] <- NA
df_2$inc_3[df_2$inc_3 == -1] <- NA
df_2$inc_4[df_2$inc_4 == -1] <- NA
df_2$inc_5[df_2$inc_5 == -1] <- NA 

df_2$lea_1[df_2$lea_1 == -1] <- NA
df_2$lea_2[df_2$lea_2 == -1] <- NA
df_2$lea_3[df_2$lea_3 == -1] <- NA
df_2$lea_4[df_2$lea_4 == -1] <- NA

df_2$age[df_2$age == "N/A"] <- NA

#we convert "Unknown" to NA as this category has less than 5 people and we do not report.
df_2$gender[df_2$gender == "Unknown"] <- NA

dfSummary(df_2)
#the end of data-processing & preparation for analysis



3 DATA ANALYSIS



3.1 Calculate the Favorable Scores for Qll Questions and Factors for Hooli Overall.



3.1.1 Calculating the favorable scores for all questions



Favorable scores are defined as the sum of the number of answers that are agree and strongly agree over the number of all non missing responses.

#let's create a function that automates the process of calculating favorable scores for all questions at once:

function_favorability <- function(df) {
  favorability_score <- numeric(ncol(df))  # Include all columns
  column_names <- names(df)  # Get the column names
  
  for (i in 1:ncol(df)) {  # Start from column 1
    # Calculate the proportion
    favorability_score[i] <- sum(df[[i]] %in% c(4,5)) / sum(df[[i]] %in% c(1:5))
  }
  
  # Combine favorability scores and column names into a data frame
  result <- data.frame(column_name = column_names, favorability_score = favorability_score)
  
  return(result)
}

# Calculate favorability_score for columns 2 to 21
favorability_score <- function_favorability(df_2[, 2:21])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8864413
## 2            ali_2          0.8325086
## 3            ali_3          0.6844309
## 4            col_1          0.8237301
## 5            col_2          0.6013986
## 6            col_3          0.8057883
## 7            eng_1          0.9209830
## 8            eng_2          0.8507181
## 9            eng_3          0.8010610
## 10           eng_4          0.6852062
## 11           eng_5          0.7222853
## 12           inc_1          0.8958967
## 13           inc_2          0.7247278
## 14           inc_3          0.8119954
## 15           inc_4          0.6996521
## 16           inc_5          0.7749616
## 17           lea_1          0.8335234
## 18           lea_2          0.8491641
## 19           lea_3          0.8949564
## 20           lea_4          0.8773728
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  
print(favorability_score_percent)
##    question_number favorability_score
## 1            ali_1                 89
## 2            ali_2                 83
## 3            ali_3                 68
## 4            col_1                 82
## 5            col_2                 60
## 6            col_3                 81
## 7            eng_1                 92
## 8            eng_2                 85
## 9            eng_3                 80
## 10           eng_4                 69
## 11           eng_5                 72
## 12           inc_1                 90
## 13           inc_2                 72
## 14           inc_3                 81
## 15           inc_4                 70
## 16           inc_5                 77
## 17           lea_1                 83
## 18           lea_2                 85
## 19           lea_3                 89
## 20           lea_4                 88
#graph favorable score for each question
##setting different colors for different factors

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")

##

group_colors <- c(
  "Alignment" = "#D08770",  # Light orange for Group 1
  "Collaboration" = "#A3BE8C",  # Light green for Group 2
  "Engagement" = "#5E81AC",  # Light blue for Group 3
  "Inclusion" = "#EBCB8B",  # Light yellow for Group 4
  "Leadership" = "#B48EAD"   # Mild purple for Group 5
)

## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars



3.1.2 Calculating a ‘factor favorable score’, which is the average favorable score among questions within the same factor, for each factor



#let's create a function that automates the process of calculating of factor favorable score.

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8011269
## 2                 col              0.7436390
## 3                 eng              0.7960507
## 4                 inc              0.7814467
## 5                 lea              0.8637542
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 68     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 82 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 81 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 85    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 78     Inclusion
## 12           inc_1                 90     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 83    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 88    Leadership
## Create a ggplot bar plot with the preselected colors (vertical)
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores (vertical)

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's enagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
#create a graph that shows enagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
#Desired order of question_number in reverse as I am making a horizontal graph
desired_order <- c("eng_5", "eng_4", "eng_3", "eng_2", "eng_1", "eng")

#Set the factor levels in reverse order
engagement_favorable_score_for_all$question_number <- factor(
  engagement_favorable_score_for_all$question_number,
  levels = (desired_order)  # Reverse the desired order
)

#Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

#Color palette for highlighting
highlight_colors <- c("first" = "#9A36B2",  # Purple for the 'first' category
                      "other" = "#C9A5DD")  # light purple for the 'other' category

#Create a horizontal bar plot with ggplot
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) + 
  coord_flip() +  
  labs(
    x = "Factor Items",  
    y = "Favorability Score", 
    title = "Engagement Factor Favorable Scores",
    caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses. Factor favorability score is an average of all the items for the factor."  # Add the caption
  ) +
  theme_minimal() + 
  theme(
    axis.text.y = element_text(size = 12),  
    axis.text.x = element_text(size = 12), 
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14),
  plot.caption = element_text(size = 6, hjust = 0)  
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), 
            hjust = -0.3,  
            size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.2 Calculate Engagement Factor (Favorable) Scores by All Tenure Groups



3.2.1 Engagement factor favorable scores for employees whose tenure are “Under 3 Months”



n = 64

# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$tenure_group == "Under 3 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1           0.984375
## 2           eng_2           0.859375
## 3           eng_3           0.906250
## 4           eng_4           0.890625
## 5           eng_5           0.828125
# Calculate the average engagement factor favorable score for those under 3 months
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1           0.984375
## 2           eng_2           0.859375
## 3           eng_3           0.906250
## 4           eng_4           0.890625
## 5           eng_5           0.828125
## 6         average           0.893750
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 98
## 2           eng_2                 86
## 3           eng_3                 91
## 4           eng_4                 89
## 5           eng_5                 83
## 6         average                 89
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # Purple for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with Under 3 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "Under 3 Months"



3.2.2 Engagement factor favorability scores for employees whose tenure are “3-6 months”



n = 325

#3-6 months
df_2[df_2$tenure_group == "3-6 months" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "3-6 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9506173
## 2           eng_2          0.9040248
## 3           eng_3          0.8796296
## 4           eng_4          0.8209877
## 5           eng_5          0.8580247
# Calculate the average engagement factor favorable score for those with 3-6 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)

average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9506173
## 2           eng_2          0.9040248
## 3           eng_3          0.8796296
## 4           eng_4          0.8209877
## 5           eng_5          0.8580247
## 6         average          0.8826568
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 95
## 2           eng_2                 90
## 3           eng_3                 88
## 4           eng_4                 82
## 5           eng_5                 86
## 6         average                 88
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# graph showing engagement factor scores for the 3-6 months tenure group

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 3-6 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "3-6 Months"



3.2.3 Engagement factor favorability scores for employees whose tenure are “6-12 months”



n = 507

#6-12 months
df_2[df_2$tenure_group == "6-12 months" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-12 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9090909
## 2           eng_2          0.8106509
## 3           eng_3          0.8134921
## 4           eng_4          0.7199211
## 5           eng_5          0.7312253
# Calculate the average engagement factor favorable score for those with 6-12 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9090909
## 2           eng_2          0.8106509
## 3           eng_3          0.8134921
## 4           eng_4          0.7199211
## 5           eng_5          0.7312253
## 6         average          0.7968761
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 91
## 2           eng_2                 81
## 3           eng_3                 81
## 4           eng_4                 72
## 5           eng_5                 73
## 6         average                 80
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# graph showing engagement factor scores for the 6-12 months tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-12 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-12 Months"



3.2.4 Engagement factor favorability scores for employees whose tenure are “1-2 years”



n= 670

#1-2 years
df_2[df_2$tenure_group == "1-2 years" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "1-2 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9235382
## 2           eng_2          0.8579970
## 3           eng_3          0.7976012
## 4           eng_4          0.6616766
## 5           eng_5          0.7140719
# Calculate the average engagement factor favorable score for those with 1-2 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9235382
## 2           eng_2          0.8579970
## 3           eng_3          0.7976012
## 4           eng_4          0.6616766
## 5           eng_5          0.7140719
## 6         average          0.7909770
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 92
## 2           eng_2                 86
## 3           eng_3                 80
## 4           eng_4                 66
## 5           eng_5                 71
## 6         average                 79
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 1-2 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 1-2 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "1-2 Years"



3.2.5 Engagement factor favorability scores for employees whose tenure are “2-4 years”



n = 644

#2-4 years
df_2[df_2$tenure_group == "2-4 years" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "2-4 years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9144635
## 2           eng_2          0.8429238
## 3           eng_3          0.7659906
## 4           eng_4          0.6209048
## 5           eng_5          0.6931464
# Calculate the average engagement factor favorable score for those with 2-4 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9144635
## 2           eng_2          0.8429238
## 3           eng_3          0.7659906
## 4           eng_4          0.6209048
## 5           eng_5          0.6931464
## 6         average          0.7674858
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 91
## 2           eng_2                 84
## 3           eng_3                 77
## 4           eng_4                 62
## 5           eng_5                 69
## 6         average                 77
#visualize the favorability scores




# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 2-4 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 2-4 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "2-4 Years"



3.2.6 Engagement factor scores for employees whose tenure are “4-6 years”



n = 274

#4-6 years
df_2[df_2$tenure_group == "4-6 years" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "4-6 years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9014599
## 2           eng_2          0.8534799
## 3           eng_3          0.7867647
## 4           eng_4          0.6300366
## 5           eng_5          0.6167883
# Calculate the average engagement factor favorable score for those with 4-6 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9014599
## 2           eng_2          0.8534799
## 3           eng_3          0.7867647
## 4           eng_4          0.6300366
## 5           eng_5          0.6167883
## 6         average          0.7577059
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 90
## 2           eng_2                 85
## 3           eng_3                 79
## 4           eng_4                 63
## 5           eng_5                 62
## 6         average                 76
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 4-6 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 4-6 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "4-6 Years"



3.2.7 Engagement factor scores for employees whose tenure are “6-10 years”



N = 163

#6-10 years
df_2[df_2$tenure_group == "6-10 years" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-10 years", 8:12])


colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9202454
## 2           eng_2          0.8588957
## 3           eng_3          0.7361963
## 4           eng_4          0.6604938
## 5           eng_5          0.7080745
# Calculate the average engagement factor favorable score for those with 6-10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9202454
## 2           eng_2          0.8588957
## 3           eng_3          0.7361963
## 4           eng_4          0.6604938
## 5           eng_5          0.7080745
## 6         average          0.7767812
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 92
## 2           eng_2                 86
## 3           eng_3                 74
## 4           eng_4                 66
## 5           eng_5                 71
## 6         average                 78
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 6-10 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-10 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-10 Years"



3.2.8 Engagement factor scores for employees whose tenure are “10+ years”



n = 4, given that this category has less than 5 employees, we don’t report it.

#it is critical that there are only 2 employees who have worked longer than 10 years.

#10+ years
df_2[df_2$tenure_group == "10+ years" & !is.na(df_2$tenure_group), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "10+ years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1               1.00
## 2           eng_2               1.00
## 3           eng_3               1.00
## 4           eng_4               1.00
## 5           eng_5               0.75
# Calculate the average engagement factor favorable score for those with over 10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1               1.00
## 2           eng_2               1.00
## 3           eng_3               1.00
## 4           eng_4               1.00
## 5           eng_5               0.75
## 6         average               0.95
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                100
## 2           eng_2                100
## 3           eng_3                100
## 4           eng_4                100
## 5           eng_5                 75
## 6         average                 95
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Favorability Scores for Employees with Over 10 Years Tenure") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
   geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5)



3.2.9 Engagement factor favorable scores across tenure groups (employee life cycle)



engagement_by_tenure$favorability_score <- round(engagement_by_tenure$favorability_score*100)
engagement_by_tenure$question_number <- factor(engagement_by_tenure$question_number, levels = engagement_by_tenure$question_number)

engagement_by_tenure
# Distinguish the average row from other item rows
engagement_by_tenure$highlight <- ifelse(engagement_by_tenure$question_number == engagement_by_tenure$question_number[1], "first", "other")


# graph showing engagement factor scores across tenure groups 
ggplot(engagement_by_tenure, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores Across Tenure Groups") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 11),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.2.10 Changes in employee engagement favorable scores over employee life cycle (line graphs)



library(dplyr)
library(ggplot2)
library(scales)

table(df_2$tenure_group)
## 
##      1-2 years      10+ years      2-4 years     3-6 months      4-6 years 
##            670              4            644            325            274 
##     6-10 years    6-12 months Under 3 months 
##            163            507             64
#List of tenure groups
tenure_groups <- c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years")

#Create an empty data frame to store the consolidated results
consolidated_favorability_scores <- data.frame(
  tenure_group = character(),
  question_number = character(),
  favorability_score = numeric(),
  stringsAsFactors = FALSE
)

#Loop through each tenure group and calculate favorability scores
for (group in tenure_groups) {
  #Calculate favorability scores
  favorability_score <- function_favorability(df_2[df_2$tenure_group == group, c(8:12)])  # Example column range
  colnames(favorability_score)[1] <- "question_number"
  
  #Calculate the average favorability score
  average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
  
  #Create the "average" row
  average_row <- data.frame(
    question_number = "average",  
    favorability_score = average_score
  )
  
  #Add the "average" row to the data frame
  favorability_score <- rbind(favorability_score, average_row)
  
  #Add the tenure group to the data frame
  favorability_score <- favorability_score %>%
    mutate(tenure_group = group)  # Label with the tenure group
  
  #Append to the consolidated data frame
  consolidated_favorability_scores <- rbind(
    consolidated_favorability_scores,
    favorability_score
  )
}

#set the order for tenure grouops in the new df
consolidated_favorability_scores <- consolidated_favorability_scores %>%
  mutate(tenure_group = factor(tenure_group, levels = tenure_groups))  # Set the levels


print(consolidated_favorability_scores)
##    question_number favorability_score   tenure_group
## 1            eng_1          0.9843750 Under 3 months
## 2            eng_2          0.8593750 Under 3 months
## 3            eng_3          0.9062500 Under 3 months
## 4            eng_4          0.8906250 Under 3 months
## 5            eng_5          0.8281250 Under 3 months
## 6          average          0.8937500 Under 3 months
## 7            eng_1          0.9506173     3-6 months
## 8            eng_2          0.9040248     3-6 months
## 9            eng_3          0.8796296     3-6 months
## 10           eng_4          0.8209877     3-6 months
## 11           eng_5          0.8580247     3-6 months
## 12         average          0.8826568     3-6 months
## 13           eng_1          0.9090909    6-12 months
## 14           eng_2          0.8106509    6-12 months
## 15           eng_3          0.8134921    6-12 months
## 16           eng_4          0.7199211    6-12 months
## 17           eng_5          0.7312253    6-12 months
## 18         average          0.7968761    6-12 months
## 19           eng_1          0.9235382      1-2 years
## 20           eng_2          0.8579970      1-2 years
## 21           eng_3          0.7976012      1-2 years
## 22           eng_4          0.6616766      1-2 years
## 23           eng_5          0.7140719      1-2 years
## 24         average          0.7909770      1-2 years
## 25           eng_1          0.9144635      2-4 years
## 26           eng_2          0.8429238      2-4 years
## 27           eng_3          0.7659906      2-4 years
## 28           eng_4          0.6209048      2-4 years
## 29           eng_5          0.6931464      2-4 years
## 30         average          0.7674858      2-4 years
## 31           eng_1          0.9014599      4-6 years
## 32           eng_2          0.8534799      4-6 years
## 33           eng_3          0.7867647      4-6 years
## 34           eng_4          0.6300366      4-6 years
## 35           eng_5          0.6167883      4-6 years
## 36         average          0.7577059      4-6 years
## 37           eng_1          0.9202454     6-10 years
## 38           eng_2          0.8588957     6-10 years
## 39           eng_3          0.7361963     6-10 years
## 40           eng_4          0.6604938     6-10 years
## 41           eng_5          0.7080745     6-10 years
## 42         average          0.7767812     6-10 years
# line graph showing each engagement factor's change over time
ggplot(consolidated_favorability_scores, aes(x = tenure_group, y = favorability_score * 100, group = question_number, color = question_number)) +
  geom_line(size = 1.2) + 
  geom_point(size = 3) +  
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorable Scores Across Employee Lifecycle",
    caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.", 
    color = "Engagement Questions"  
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
  plot.caption = element_text(size = 6, hjust = 0)    
  ) +
  scale_y_continuous(labels = percent_format(scale = 1))  

#create another line graph in which I emphasize average, eng_4, and eng_5


# Assign colors or line styles to emphasize specific lines
line_colors <- c(
  "average" = "red",  # Emphasize the "average" line with red color
  "eng_4" = "blue",  # Emphasize "eng_4" with blue
  "eng_5" = "orange",  # Emphasize "eng_5" with orange
  "other" = "gray"  # Use a neutral color for other lines
)

# Create a line plot with customized colors for emphasis
ggplot(consolidated_favorability_scores, aes(x = tenure_group, y = favorability_score * 100, group = question_number)) +
  geom_line(aes(color = ifelse(question_number %in% c("average", "eng_4", "eng_5"), question_number, "other")), size = 1.2) +  # Draw lines with custom colors
  geom_point(aes(color = ifelse(question_number %in% c("average", "eng_4", "eng_5"), question_number, "other")), size = 3) +  # Add points with matching colors
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorability Scores Across Employee Lifecycle",
    caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.",  # Detailed caption
    color = "Engagement Questions"  # Custom legend title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
  plot.caption = element_text(size = 6, hjust = 0)    
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +  # Display y-axis as percentage
  scale_color_manual(values = line_colors)  # Assign specific colors to lines

#Create an empty data frame to store the consolidated results for men and women separately

consolidated_favorability_scores_men <- data.frame(
  tenure_group = character(),
  question_number = character(),
  favorability_score = numeric(),
  stringsAsFactors = FALSE
)

consolidated_favorability_scores_women <- data.frame(
  tenure_group = character(),
  question_number = character(),
  stringsAsFactors = FALSE
)

# Define tenure groups, excluding '10+ years'
tenure_groups <- c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years")

# Function to calculate favorability scores and add tenure groups
calculate_scores_by_tenure <- function(data, tenure_groups) {
  consolidated_scores <- data.frame(
    tenure_group = character(),
    question_number = character(),
    favorability_score = numeric(),
    stringsAsFactors = FALSE
  )

  for (group in tenure_groups) {
    # Calculate favorability scores for each tenure group
    favorability_score <- function_favorability(data[data$tenure_group == group, c(8:12)])  # Example column range
    colnames(favorability_score)[1] <- "question_number"
    
    # Calculate the average favorability score
    average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
    
    # Create the "average" row
    average_row <- data.frame(
      question_number = "average",
      favorability_score = average_score
    )
    
    # Add the "average" row to the data frame
    favorability_score <- rbind(favorability_score, average_row)
    
    # Add the tenure group to the data frame
    favorability_score <- favorability_score %>%
      mutate(tenure_group = group)  # Label with the tenure group
    
    # Append to the consolidated data frame
    consolidated_scores <- rbind(
      consolidated_scores,
      favorability_score
    )
  }

  # Set the order for tenure groups
  consolidated_scores <- consolidated_scores %>%
    mutate(tenure_group = factor(tenure_group, levels = tenure_groups))
  
  return(consolidated_scores)
}

# Calculate scores for men and women, excluding '10+ years'
consolidated_favorability_scores_men <- calculate_scores_by_tenure(df_2[df_2$gender == "Male" & df_2$tenure_group != "10+ years", ], tenure_groups)
consolidated_favorability_scores_women <- calculate_scores_by_tenure(df_2[df_2$gender == "Female" & df_2$tenure_group != "10+ years", ], tenure_groups)

# Now create the line plots for men and women, excluding '10+ years'
# Line plot for men
ggplot(consolidated_favorability_scores_men, aes(x = tenure_group, y = favorability_score * 100, group = question_number, color = question_number)) +
  geom_line(size = 1.2) +  # Draw lines
  geom_point(size = 3) +  # Add points
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorability Scores for Men Across Employee Lifecycle",
     caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.",  # Detailed caption
    color = "Engagement Questions"  # Custom legend title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0)    
  ) +
  scale_y_continuous(labels = percent_format(scale = 1))  # Display y-axis as percentage

# Line plot for men
ggplot(consolidated_favorability_scores_men, aes(x = tenure_group, y = favorability_score * 100, group = question_number, color = question_number)) +
  geom_line(size = 1.2) +  # Draw lines
  geom_point(size = 3) +  # Add points
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorability Scores for Men Across Employee Lifecycle",
     caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.",  # Detailed caption
    color = "Engagement Questions"  # Custom legend title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0)    
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +  # Display y-axis as percentage
  scale_color_manual(values = line_colors)

# Line plot for women
ggplot(consolidated_favorability_scores_women, aes(x = tenure_group, y = favorability_score * 100, group = question_number, color = question_number)) +
  geom_line(size = 1.2) +  # Draw lines
  geom_point(size = 3) +  # Add points
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorability Scores for Women Across Employee Lifecycle",
     caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.",  # Detailed caption
    color = "Engagement Questions"  # Custom legend title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0)    
  ) +
  scale_y_continuous(labels = percent_format(scale = 1))

# Line plot for women
ggplot(consolidated_favorability_scores_women, aes(x = tenure_group, y = favorability_score * 100, group = question_number, color = question_number)) +
  geom_line(size = 1.2) +  # Draw lines
  geom_point(size = 3) +  # Add points
  labs(
    x = "Tenure Group",
    y = "Favorability Score (%)",
    title = "Change in Engagement Favorability Scores for Women Across Employee Lifecycle",
     caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses.",  # Detailed caption
    color = "Engagement Questions"  # Custom legend title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0)    
  ) +
 scale_y_continuous(labels = percent_format(scale = 1)) +  # Display y-axis as percentage
  scale_color_manual(values = line_colors)



3.3 Calculate Favorable Scores for All Questions and Factors by gender



3.3.1 Favorable scores for all questions and factors for women



N for female = 820

table(df_2$gender)
## 
## Female   Male 
##    820   1830
df_2[df_2$gender== "Female" & !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Female", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8441718
## 2            ali_2          0.7980296
## 3            ali_3          0.6584464
## 4            col_1          0.7610294
## 5            col_2          0.6022727
## 6            col_3          0.8204182
## 7            eng_1          0.8974359
## 8            eng_2          0.8095238
## 9            eng_3          0.7533742
## 10           eng_4          0.6328029
## 11           eng_5          0.6813725
## 12           inc_1          0.8620269
## 13           inc_2          0.6492537
## 14           inc_3          0.7382134
## 15           inc_4          0.6509317
## 16           inc_5          0.7719950
## 17           lea_1          0.8019680
## 18           lea_2          0.8009828
## 19           lea_3          0.8946078
## 20           lea_4          0.8378378
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 84     Alignment
## 2            ali_2                 80     Alignment
## 3            ali_3                 66     Alignment
## 4            col_1                 76 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 82 Collaboration
## 7            eng_1                 90    Engagement
## 8            eng_2                 81    Engagement
## 9            eng_3                 75    Engagement
## 10           eng_4                 63    Engagement
## 11           eng_5                 68    Engagement
## 12           inc_1                 86     Inclusion
## 13           inc_2                 65     Inclusion
## 14           inc_3                 74     Inclusion
## 15           inc_4                 65     Inclusion
## 16           inc_5                 77     Inclusion
## 17           lea_1                 80    Leadership
## 18           lea_2                 80    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 84    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7668826
## 2                 col              0.7279068
## 3                 eng              0.7549019
## 4                 inc              0.7344841
## 5                 lea              0.8338491
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")


#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 84     Alignment
## 2            ali_2                 80     Alignment
## 3            ali_3                 66     Alignment
## 22             col                 73 Collaboration
## 4            col_1                 76 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 82 Collaboration
## 23             eng                 75    Engagement
## 7            eng_1                 90    Engagement
## 8            eng_2                 81    Engagement
## 9            eng_3                 75    Engagement
## 10           eng_4                 63    Engagement
## 11           eng_5                 68    Engagement
## 24             inc                 73     Inclusion
## 12           inc_1                 86     Inclusion
## 13           inc_2                 65     Inclusion
## 14           inc_3                 74     Inclusion
## 15           inc_4                 65     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 83    Leadership
## 17           lea_1                 80    Leadership
## 18           lea_2                 80    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 84    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
#create a graph that shows engagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

#Create a ggplot that shows engagement factor favorable scores for women

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.2 Favorable scores for all questions and factors for men



n = 1830

# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Male", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9053385
## 2            ali_2          0.8484013
## 3            ali_3          0.6958678
## 4            col_1          0.8517298
## 5            col_2          0.6007861
## 6            col_3          0.7991170
## 7            eng_1          0.9315068
## 8            eng_2          0.8691128
## 9            eng_3          0.8222710
## 10           eng_4          0.7090411
## 11           eng_5          0.7404162
## 12           inc_1          0.9111479
## 13           inc_2          0.7589134
## 14           inc_3          0.8450390
## 15           inc_4          0.7216611
## 16           inc_5          0.7761693
## 17           lea_1          0.8475509
## 18           lea_2          0.8706659
## 19           lea_3          0.8950549
## 20           lea_4          0.8949973
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  
print(favorability_score_percent)
##    question_number favorability_score
## 1            ali_1                 91
## 2            ali_2                 85
## 3            ali_3                 70
## 4            col_1                 85
## 5            col_2                 60
## 6            col_3                 80
## 7            eng_1                 93
## 8            eng_2                 87
## 9            eng_3                 82
## 10           eng_4                 71
## 11           eng_5                 74
## 12           inc_1                 91
## 13           inc_2                 76
## 14           inc_3                 85
## 15           inc_4                 72
## 16           inc_5                 78
## 17           lea_1                 85
## 18           lea_2                 87
## 19           lea_3                 90
## 20           lea_4                 89
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 70     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 80 Collaboration
## 7            eng_1                 93    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 74    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 72     Inclusion
## 16           inc_5                 78     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 90    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8165359
## 2                 col              0.7505443
## 3                 eng              0.8144696
## 4                 inc              0.8025861
## 5                 lea              0.8770673
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")


#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 82     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 70     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 80 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 93    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 74    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 72     Inclusion
## 16           inc_5                 78     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 90    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores 

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
#create a graph that shows engagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

# create a graph that shows engagement factor favorable scores for men 
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.3 a graph comparing enagement factor favorable scores between men and women



library(ggplot2)
library(dplyr)

# Calculate engagement favorability scores for all employees
all_employees_favorability_score <- function_favorability(df_2[, 8:12])
all_employees_favorability_score$gender <- "All Employees"  # Label as 'All Employees'

# Calculate engagement favorability scores for men
men_df <- df_2[df_2$gender == "Male", ]
men_favorability_score <- function_favorability(men_df[, 8:12])
men_favorability_score$gender <- "Men"  # Label as 'Men'

# Calculate engagement favorability scores for women
women_df <- df_2[df_2$gender == "Female", ]
women_favorability_score <- function_favorability(women_df[, 8:12])
women_favorability_score$gender = "Women"  # Label as 'Women'

# Combine all the scores into a single data frame
combined_favorability_scores <- rbind(
  all_employees_favorability_score,
  men_favorability_score,
  women_favorability_score
)

# Convert favorability scores to percentages
combined_favorability_scores$favorability_score <- combined_favorability_scores$favorability_score * 100

# Rename the first column to "question_number"
colnames(combined_favorability_scores)[1] = "question_number"

# Create a bar graph with ggplot2
ggplot(combined_favorability_scores, aes(x = question_number, y = favorability_score, fill = gender)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  labs(
    x = "Engagement Question",
    y = "Favorability Score (%)",
    title = "Engagement Favorability Scores by Gender",
    caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all responses."
    )+
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
  plot.caption = element_text(size = 6, hjust = 0)    # Style the title
  ) +
  geom_text(aes(label = paste0(round(favorability_score, 1), "%")),  # Add percentage labels
            position = position_dodge(width = 0.9),
            vjust = -0.5,
            size = 3.5) +  # Adjust text size
  scale_fill_manual(  # Custom colors for gender
    values = c("Men" = "skyblue", "Women" = "purple", "All Employees" = "gray")
  )





3.3.4 Favorable scores for all questions for women for Under 3 months



n = 25

df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "Under 3 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.6000000
## 2            ali_2          0.8800000
## 3            ali_3          0.6800000
## 4            col_1          0.8800000
## 5            col_2          0.6500000
## 6            col_3          0.7916667
## 7            eng_1          0.9600000
## 8            eng_2          0.8000000
## 9            eng_3          0.9200000
## 10           eng_4          0.9600000
## 11           eng_5          0.8000000
## 12           inc_1          0.9200000
## 13           inc_2          0.7826087
## 14           inc_3          0.9200000
## 15           inc_4          0.7600000
## 16           inc_5          0.9200000
## 17           lea_1          0.9600000
## 18           lea_2          0.9200000
## 19           lea_3          0.9600000
## 20           lea_4          0.9600000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 60     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 68     Alignment
## 4            col_1                 88 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 79 Collaboration
## 7            eng_1                 96    Engagement
## 8            eng_2                 80    Engagement
## 9            eng_3                 92    Engagement
## 10           eng_4                 96    Engagement
## 11           eng_5                 80    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 78     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 76     Inclusion
## 16           inc_5                 92     Inclusion
## 17           lea_1                 96    Leadership
## 18           lea_2                 92    Leadership
## 19           lea_3                 96    Leadership
## 20           lea_4                 96    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7200000
## 2                 col              0.7738889
## 3                 eng              0.8880000
## 4                 inc              0.8605217
## 5                 lea              0.9500000
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 72     Alignment
## 1            ali_1                 60     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 68     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 88 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 79 Collaboration
## 23             eng                 89    Engagement
## 7            eng_1                 96    Engagement
## 8            eng_2                 80    Engagement
## 9            eng_3                 92    Engagement
## 10           eng_4                 96    Engagement
## 11           eng_5                 80    Engagement
## 24             inc                 86     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 78     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 76     Inclusion
## 16           inc_5                 92     Inclusion
## 25             lea                 95    Leadership
## 17           lea_1                 96    Leadership
## 18           lea_2                 92    Leadership
## 19           lea_3                 96    Leadership
## 20           lea_4                 96    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Under 3 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
#create a graph that shows engagement favorable scores for women with tenure under 3 months  

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure Under 3 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.5 Favorable scores for all questions for women with tenure between 3-6 months



n = 116

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "3-6 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9130435
## 2            ali_2          0.8782609
## 3            ali_3          0.7739130
## 4            col_1          0.8534483
## 5            col_2          0.5904762
## 6            col_3          0.9130435
## 7            eng_1          0.9396552
## 8            eng_2          0.9130435
## 9            eng_3          0.8448276
## 10           eng_4          0.7931034
## 11           eng_5          0.8448276
## 12           inc_1          0.8782609
## 13           inc_2          0.6725664
## 14           inc_3          0.7894737
## 15           inc_4          0.7433628
## 16           inc_5          0.7982456
## 17           lea_1          0.8347826
## 18           lea_2          0.8956522
## 19           lea_3          0.9391304
## 20           lea_4          0.8608696
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 77     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 91 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 91    Engagement
## 9            eng_3                 84    Engagement
## 10           eng_4                 79    Engagement
## 11           eng_5                 84    Engagement
## 12           inc_1                 88     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 79     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 83    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 94    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8550725
## 2                 col              0.7856560
## 3                 eng              0.8670915
## 4                 inc              0.7763819
## 5                 lea              0.8826087
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 77     Alignment
## 22             col                 79 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 91 Collaboration
## 23             eng                 87    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 91    Engagement
## 9            eng_3                 84    Engagement
## 10           eng_4                 79    Engagement
## 11           eng_5                 84    Engagement
## 24             inc                 78     Inclusion
## 12           inc_1                 88     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 79     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 83    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 94    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "3-6 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 3-6 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.6 Favorable scores for all questions for women with tenure between 6-12 months



n = 181

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-12 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8166667
## 2            ali_2          0.7777778
## 3            ali_3          0.7111111
## 4            col_1          0.7262570
## 5            col_2          0.5674157
## 6            col_3          0.7734807
## 7            eng_1          0.8950276
## 8            eng_2          0.7734807
## 9            eng_3          0.7932961
## 10           eng_4          0.7071823
## 11           eng_5          0.6944444
## 12           inc_1          0.8618785
## 13           inc_2          0.6944444
## 14           inc_3          0.8100559
## 15           inc_4          0.6815642
## 16           inc_5          0.7888889
## 17           lea_1          0.7944444
## 18           lea_2          0.8333333
## 19           lea_3          0.8674033
## 20           lea_4          0.8287293
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 82     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 71     Alignment
## 4            col_1                 73 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 77 Collaboration
## 7            eng_1                 90    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 79    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 69    Engagement
## 12           inc_1                 86     Inclusion
## 13           inc_2                 69     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 83    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 83    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7685185
## 2                 col              0.6890511
## 3                 eng              0.7726862
## 4                 inc              0.7673664
## 5                 lea              0.8309776
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 82     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 71     Alignment
## 22             col                 69 Collaboration
## 4            col_1                 73 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 77 Collaboration
## 23             eng                 77    Engagement
## 7            eng_1                 90    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 79    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 69    Engagement
## 24             inc                 77     Inclusion
## 12           inc_1                 86     Inclusion
## 13           inc_2                 69     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 25             lea                 83    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 83    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 83    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-12 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6 and 12 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.7 Favorable scores for all questions for women with tenure between 1-2 years



n = 211

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "1-2 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8666667
## 2            ali_2          0.7809524
## 3            ali_3          0.6507177
## 4            col_1          0.7772512
## 5            col_2          0.6201923
## 6            col_3          0.8373206
## 7            eng_1          0.8904762
## 8            eng_2          0.8293839
## 9            eng_3          0.7320574
## 10           eng_4          0.5971564
## 11           eng_5          0.6952381
## 12           inc_1          0.9004739
## 13           inc_2          0.6666667
## 14           inc_3          0.7464115
## 15           inc_4          0.6411483
## 16           inc_5          0.8038278
## 17           lea_1          0.8173077
## 18           lea_2          0.8104265
## 19           lea_3          0.9238095
## 20           lea_4          0.8619048
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 87     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 65     Alignment
## 4            col_1                 78 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 7            eng_1                 89    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 73    Engagement
## 10           eng_4                 60    Engagement
## 11           eng_5                 70    Engagement
## 12           inc_1                 90     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 75     Inclusion
## 15           inc_4                 64     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 82    Leadership
## 18           lea_2                 81    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7661123
## 2                 col              0.7449214
## 3                 eng              0.7488624
## 4                 inc              0.7517056
## 5                 lea              0.8533621
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 87     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 65     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 78 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 23             eng                 75    Engagement
## 7            eng_1                 89    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 73    Engagement
## 10           eng_4                 60    Engagement
## 11           eng_5                 70    Engagement
## 24             inc                 75     Inclusion
## 12           inc_1                 90     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 75     Inclusion
## 15           inc_4                 64     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 85    Leadership
## 17           lea_1                 82    Leadership
## 18           lea_2                 81    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "1-2 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 1-2 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.8 Favorable scores for all questions for women with tenure between 2-4 years



n = 191

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "2-4 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8473684
## 2            ali_2          0.7765957
## 3            ali_3          0.6137566
## 4            col_1          0.7421053
## 5            col_2          0.6170213
## 6            col_3          0.8315789
## 7            eng_1          0.8900524
## 8            eng_2          0.7748691
## 9            eng_3          0.6910995
## 10           eng_4          0.5238095
## 11           eng_5          0.6000000
## 12           inc_1          0.8167539
## 13           inc_2          0.5828877
## 14           inc_3          0.6256684
## 15           inc_4          0.6276596
## 16           inc_5          0.7393617
## 17           lea_1          0.7801047
## 18           lea_2          0.7315789
## 19           lea_3          0.8691099
## 20           lea_4          0.8115183
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 85     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 61     Alignment
## 4            col_1                 74 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 83 Collaboration
## 7            eng_1                 89    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 69    Engagement
## 10           eng_4                 52    Engagement
## 11           eng_5                 60    Engagement
## 12           inc_1                 82     Inclusion
## 13           inc_2                 58     Inclusion
## 14           inc_3                 63     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 74     Inclusion
## 17           lea_1                 78    Leadership
## 18           lea_2                 73    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 81    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7459069
## 2                 col              0.7302352
## 3                 eng              0.6959661
## 4                 inc              0.6784663
## 5                 lea              0.7980780
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 75     Alignment
## 1            ali_1                 85     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 61     Alignment
## 22             col                 73 Collaboration
## 4            col_1                 74 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 83 Collaboration
## 23             eng                 70    Engagement
## 7            eng_1                 89    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 69    Engagement
## 10           eng_4                 52    Engagement
## 11           eng_5                 60    Engagement
## 24             inc                 68     Inclusion
## 12           inc_1                 82     Inclusion
## 13           inc_2                 58     Inclusion
## 14           inc_3                 63     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 74     Inclusion
## 25             lea                 80    Leadership
## 17           lea_1                 78    Leadership
## 18           lea_2                 73    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 81    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "2-4 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 2-4 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.9 Favorable scores for all questions for women with tenure between 4-6 years



n = 58

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "4-6 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8448276
## 2            ali_2          0.7894737
## 3            ali_3          0.5357143
## 4            col_1          0.6315789
## 5            col_2          0.6140351
## 6            col_3          0.7142857
## 7            eng_1          0.8620690
## 8            eng_2          0.7931034
## 9            eng_3          0.7368421
## 10           eng_4          0.4655172
## 11           eng_5          0.5000000
## 12           inc_1          0.7931034
## 13           inc_2          0.5964912
## 14           inc_3          0.6607143
## 15           inc_4          0.5818182
## 16           inc_5          0.6428571
## 17           lea_1          0.7321429
## 18           lea_2          0.6607143
## 19           lea_3          0.9107143
## 20           lea_4          0.8181818
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 84     Alignment
## 2            ali_2                 79     Alignment
## 3            ali_3                 54     Alignment
## 4            col_1                 63 Collaboration
## 5            col_2                 61 Collaboration
## 6            col_3                 71 Collaboration
## 7            eng_1                 86    Engagement
## 8            eng_2                 79    Engagement
## 9            eng_3                 74    Engagement
## 10           eng_4                 47    Engagement
## 11           eng_5                 50    Engagement
## 12           inc_1                 79     Inclusion
## 13           inc_2                 60     Inclusion
## 14           inc_3                 66     Inclusion
## 15           inc_4                 58     Inclusion
## 16           inc_5                 64     Inclusion
## 17           lea_1                 73    Leadership
## 18           lea_2                 66    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 82    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7233385
## 2                 col              0.6532999
## 3                 eng              0.6715064
## 4                 inc              0.6549969
## 5                 lea              0.7804383
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 72     Alignment
## 1            ali_1                 84     Alignment
## 2            ali_2                 79     Alignment
## 3            ali_3                 54     Alignment
## 22             col                 65 Collaboration
## 4            col_1                 63 Collaboration
## 5            col_2                 61 Collaboration
## 6            col_3                 71 Collaboration
## 23             eng                 67    Engagement
## 7            eng_1                 86    Engagement
## 8            eng_2                 79    Engagement
## 9            eng_3                 74    Engagement
## 10           eng_4                 47    Engagement
## 11           eng_5                 50    Engagement
## 24             inc                 65     Inclusion
## 12           inc_1                 79     Inclusion
## 13           inc_2                 60     Inclusion
## 14           inc_3                 66     Inclusion
## 15           inc_4                 58     Inclusion
## 16           inc_5                 64     Inclusion
## 25             lea                 78    Leadership
## 17           lea_1                 73    Leadership
## 18           lea_2                 66    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 82    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "4-6 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 4-6 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.10 Favorable scores for all questions for women with tenure between 6-10 years



n = 37

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-10 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.7777778
## 2            ali_2          0.8055556
## 3            ali_3          0.4722222
## 4            col_1          0.7567568
## 5            col_2          0.5714286
## 6            col_3          0.7837838
## 7            eng_1          0.8648649
## 8            eng_2          0.7567568
## 9            eng_3          0.6216216
## 10           eng_4          0.5555556
## 11           eng_5          0.6666667
## 12           inc_1          0.9189189
## 13           inc_2          0.6111111
## 14           inc_3          0.7714286
## 15           inc_4          0.4285714
## 16           inc_5          0.6764706
## 17           lea_1          0.7567568
## 18           lea_2          0.7777778
## 19           lea_3          0.7837838
## 20           lea_4          0.7500000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 78     Alignment
## 2            ali_2                 81     Alignment
## 3            ali_3                 47     Alignment
## 4            col_1                 76 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 78 Collaboration
## 7            eng_1                 86    Engagement
## 8            eng_2                 76    Engagement
## 9            eng_3                 62    Engagement
## 10           eng_4                 56    Engagement
## 11           eng_5                 67    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 61     Inclusion
## 14           inc_3                 77     Inclusion
## 15           inc_4                 43     Inclusion
## 16           inc_5                 68     Inclusion
## 17           lea_1                 76    Leadership
## 18           lea_2                 78    Leadership
## 19           lea_3                 78    Leadership
## 20           lea_4                 75    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.6851852
## 2                 col              0.7039897
## 3                 eng              0.6930931
## 4                 inc              0.6813001
## 5                 lea              0.7670796
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 69     Alignment
## 1            ali_1                 78     Alignment
## 2            ali_2                 81     Alignment
## 3            ali_3                 47     Alignment
## 22             col                 70 Collaboration
## 4            col_1                 76 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 78 Collaboration
## 23             eng                 69    Engagement
## 7            eng_1                 86    Engagement
## 8            eng_2                 76    Engagement
## 9            eng_3                 62    Engagement
## 10           eng_4                 56    Engagement
## 11           eng_5                 67    Engagement
## 24             inc                 68     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 61     Inclusion
## 14           inc_3                 77     Inclusion
## 15           inc_4                 43     Inclusion
## 16           inc_5                 68     Inclusion
## 25             lea                 77    Leadership
## 17           lea_1                 76    Leadership
## 18           lea_2                 78    Leadership
## 19           lea_3                 78    Leadership
## 20           lea_4                 75    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-10 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6-10 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.11 Favorable scores for all questions for men for Under 3 months



n = 39

df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "Under 3 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8461538
## 2            ali_2          0.9487179
## 3            ali_3          0.7948718
## 4            col_1          0.8717949
## 5            col_2          0.5142857
## 6            col_3          0.8717949
## 7            eng_1          1.0000000
## 8            eng_2          0.8974359
## 9            eng_3          0.8974359
## 10           eng_4          0.8461538
## 11           eng_5          0.8461538
## 12           inc_1          0.9487179
## 13           inc_2          0.7222222
## 14           inc_3          0.9210526
## 15           inc_4          0.7777778
## 16           inc_5          0.8974359
## 17           lea_1          0.7948718
## 18           lea_2          0.8974359
## 19           lea_3          0.9230769
## 20           lea_4          0.8205128
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 85     Alignment
## 2            ali_2                 95     Alignment
## 3            ali_3                 79     Alignment
## 4            col_1                 87 Collaboration
## 5            col_2                 51 Collaboration
## 6            col_3                 87 Collaboration
## 7            eng_1                100    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 85    Engagement
## 11           eng_5                 85    Engagement
## 12           inc_1                 95     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 78     Inclusion
## 16           inc_5                 90     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 82    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8632479
## 2                 col              0.7526252
## 3                 eng              0.8974359
## 4                 inc              0.8534413
## 5                 lea              0.8589744
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")

#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 85     Alignment
## 2            ali_2                 95     Alignment
## 3            ali_3                 79     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 87 Collaboration
## 5            col_2                 51 Collaboration
## 6            col_3                 87 Collaboration
## 23             eng                 90    Engagement
## 7            eng_1                100    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 85    Engagement
## 11           eng_5                 85    Engagement
## 24             inc                 85     Inclusion
## 12           inc_1                 95     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 78     Inclusion
## 16           inc_5                 90     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 82    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Under 3 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
#create a graph that shows engagement favorable score across tenure groups 

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure Under 3 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.12 Favorable scores for all questions for men with tenure between 3-6 months



n = 209

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "3-6 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9320388
## 2            ali_2          0.8888889
## 3            ali_3          0.7584541
## 4            col_1          0.9033816
## 5            col_2          0.5322581
## 6            col_3          0.8689320
## 7            eng_1          0.9567308
## 8            eng_2          0.8990385
## 9            eng_3          0.8990385
## 10           eng_4          0.8365385
## 11           eng_5          0.8653846
## 12           inc_1          0.9268293
## 13           inc_2          0.7587940
## 14           inc_3          0.8712871
## 15           inc_4          0.7450000
## 16           inc_5          0.7980296
## 17           lea_1          0.8743961
## 18           lea_2          0.9077670
## 19           lea_3          0.9275362
## 20           lea_4          0.9275362
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 93     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 76     Alignment
## 4            col_1                 90 Collaboration
## 5            col_2                 53 Collaboration
## 6            col_3                 87 Collaboration
## 7            eng_1                 96    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 84    Engagement
## 11           eng_5                 87    Engagement
## 12           inc_1                 93     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 87     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 87    Leadership
## 18           lea_2                 91    Leadership
## 19           lea_3                 93    Leadership
## 20           lea_4                 93    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8597939
## 2                 col              0.7681906
## 3                 eng              0.8913462
## 4                 inc              0.8199880
## 5                 lea              0.9093089
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 93     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 76     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 90 Collaboration
## 5            col_2                 53 Collaboration
## 6            col_3                 87 Collaboration
## 23             eng                 89    Engagement
## 7            eng_1                 96    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 84    Engagement
## 11           eng_5                 87    Engagement
## 24             inc                 82     Inclusion
## 12           inc_1                 93     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 87     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 91    Leadership
## 17           lea_1                 87    Leadership
## 18           lea_2                 91    Leadership
## 19           lea_3                 93    Leadership
## 20           lea_4                 93    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "3-6 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 3-6 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.13 Favorable scores for all questions for men with tenure between 6-12 months



n = 325

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-12 months" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8981481
## 2            ali_2          0.8452012
## 3            ali_3          0.6677019
## 4            col_1          0.8664596
## 5            col_2          0.5460317
## 6            col_3          0.8204334
## 7            eng_1          0.9166667
## 8            eng_2          0.8307692
## 9            eng_3          0.8240741
## 10           eng_4          0.7292308
## 11           eng_5          0.7507692
## 12           inc_1          0.9068323
## 13           inc_2          0.7746032
## 14           inc_3          0.8593750
## 15           inc_4          0.6813880
## 16           inc_5          0.7875000
## 17           lea_1          0.8456790
## 18           lea_2          0.8641975
## 19           lea_3          0.9138462
## 20           lea_4          0.8947368
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 87 Collaboration
## 5            col_2                 55 Collaboration
## 6            col_3                 82 Collaboration
## 7            eng_1                 92    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 73    Engagement
## 11           eng_5                 75    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 86    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8036837
## 2                 col              0.7443083
## 3                 eng              0.8103020
## 4                 inc              0.8019397
## 5                 lea              0.8796149
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 87 Collaboration
## 5            col_2                 55 Collaboration
## 6            col_3                 82 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 73    Engagement
## 11           eng_5                 75    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 86    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-12 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6 and 12 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.14 Favorable scores for all questions for men with tenure between 1-2 years



n = 459

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "1-2 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9102845
## 2            ali_2          0.8231441
## 3            ali_3          0.6710240
## 4            col_1          0.8518519
## 5            col_2          0.6228070
## 6            col_3          0.8358862
## 7            eng_1          0.9387309
## 8            eng_2          0.8711790
## 9            eng_3          0.8275109
## 10           eng_4          0.6914661
## 11           eng_5          0.7227074
## 12           inc_1          0.9080963
## 13           inc_2          0.7550562
## 14           inc_3          0.8505495
## 15           inc_4          0.7676991
## 16           inc_5          0.7802198
## 17           lea_1          0.8769231
## 18           lea_2          0.8796499
## 19           lea_3          0.8903509
## 20           lea_4          0.9082969
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 82     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 83    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 77     Inclusion
## 16           inc_5                 78     Inclusion
## 17           lea_1                 88    Leadership
## 18           lea_2                 88    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 91    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8014842
## 2                 col              0.7701817
## 3                 eng              0.8103189
## 4                 inc              0.8123242
## 5                 lea              0.8888052
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 82     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 83    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 81     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 77     Inclusion
## 16           inc_5                 78     Inclusion
## 25             lea                 89    Leadership
## 17           lea_1                 88    Leadership
## 18           lea_2                 88    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 91    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "1-2 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 1-2 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.15 n. favorable scores for all questions for men with tenure between 2-4 years



n = 453

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "2-4 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8953229
## 2            ali_2          0.8459821
## 3            ali_3          0.6674107
## 4            col_1          0.8177778
## 5            col_2          0.5915179
## 6            col_3          0.7438753
## 7            eng_1          0.9247788
## 8            eng_2          0.8716814
## 9            eng_3          0.7977778
## 10           eng_4          0.6615044
## 11           eng_5          0.7323009
## 12           inc_1          0.9159292
## 13           inc_2          0.7652370
## 14           inc_3          0.8224719
## 15           inc_4          0.7042889
## 16           inc_5          0.7645740
## 17           lea_1          0.8466667
## 18           lea_2          0.8530067
## 19           lea_3          0.8824834
## 20           lea_4          0.8933333
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 82 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 74 Collaboration
## 7            eng_1                 92    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 66    Engagement
## 11           eng_5                 73    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 82     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 76     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 88    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8029053
## 2                 col              0.7177236
## 3                 eng              0.7976087
## 4                 inc              0.7945002
## 5                 lea              0.8688725
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 72 Collaboration
## 4            col_1                 82 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 74 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 66    Engagement
## 11           eng_5                 73    Engagement
## 24             inc                 79     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 82     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 76     Inclusion
## 25             lea                 87    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 88    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "2-4 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 2-4 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.16 Favorable scores for all questions for men with tenure between 4-6 years



n = 216

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "4-6 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8930233
## 2            ali_2          0.8262911
## 3            ali_3          0.7289720
## 4            col_1          0.8465116
## 5            col_2          0.6525822
## 6            col_3          0.7428571
## 7            eng_1          0.9120370
## 8            eng_2          0.8697674
## 9            eng_3          0.8000000
## 10           eng_4          0.6744186
## 11           eng_5          0.6481481
## 12           inc_1          0.9052133
## 13           inc_2          0.7572816
## 14           inc_3          0.8564593
## 15           inc_4          0.7439614
## 16           inc_5          0.7403846
## 17           lea_1          0.7943925
## 18           lea_2          0.8651163
## 19           lea_3          0.8925234
## 20           lea_4          0.8738318
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 73     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 74 Collaboration
## 7            eng_1                 91    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 67    Engagement
## 11           eng_5                 65    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 74     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 87    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8160954
## 2                 col              0.7473170
## 3                 eng              0.7808742
## 4                 inc              0.8006600
## 5                 lea              0.8564660
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 82     Alignment
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 73     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 74 Collaboration
## 23             eng                 78    Engagement
## 7            eng_1                 91    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 67    Engagement
## 11           eng_5                 65    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 74     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 87    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "4-6 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 4-6 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.3.17 Favorable scores for all questions for men with tenure between 6-10 years



n = 126

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-10 years" & 
  !is.na(df_2$gender), ]
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9354839
## 2            ali_2          0.8943089
## 3            ali_3          0.7723577
## 4            col_1          0.8492063
## 5            col_2          0.7200000
## 6            col_3          0.7680000
## 7            eng_1          0.9365079
## 8            eng_2          0.8888889
## 9            eng_3          0.7698413
## 10           eng_4          0.6904762
## 11           eng_5          0.7200000
## 12           inc_1          0.8861789
## 13           inc_2          0.7166667
## 14           inc_3          0.7786885
## 15           inc_4          0.6290323
## 16           inc_5          0.7704918
## 17           lea_1          0.8080000
## 18           lea_2          0.8548387
## 19           lea_3          0.8480000
## 20           lea_4          0.8560000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 94     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 77     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 72 Collaboration
## 6            col_3                 77 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 89    Engagement
## 9            eng_3                 77    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 12           inc_1                 89     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 78     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 77     Inclusion
## 17           lea_1                 81    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 85    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8673835
## 2                 col              0.7790688
## 3                 eng              0.8011429
## 4                 inc              0.7562116
## 5                 lea              0.8417097
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 87     Alignment
## 1            ali_1                 94     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 77     Alignment
## 22             col                 78 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 72 Collaboration
## 6            col_3                 77 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 89    Engagement
## 9            eng_3                 77    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 76     Inclusion
## 12           inc_1                 89     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 78     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 84    Leadership
## 17           lea_1                 81    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 85    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-10 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6-10 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors) 





3.4 Correlate All Non-Engagement Questions to the Engagement Factor for Hooli Overall.



3.4.1 For all employees & all non-engagement items (Kendall’s tau-b & pearson’s r)



The results from the correlations calculated by conducting Kendall’s tau-b and pearson’s r are largley convergent while kendall’s tau-b’s results are much more conservative in terms of the size of the correlations.

#Create a new variable named 'engagement_factor', representing the average engagement scores.
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

#Select only the numeric columns from 'df_2' excluding 'eng_1' to 'eng_5'
numeric_df_2 <- Filter(is.numeric, df_2)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)

# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

library(corrplot)
## corrplot 0.92 loaded
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 2,651 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4             4          3.8       4.75
##  3      4.67          4.67       5         5   
##  4      5             5          4.4       5   
##  5      4.67          5          4.8       5   
##  6      3.67          4          3.4       3.25
##  7      3.67          3          4         5   
##  8      4.67          4.67       4.6       5   
##  9      4.67          4.33       4.8       5   
## 10      5             4.67       4.6       4.5 
## # ℹ 2,641 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 2,651 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        3.6
##  3        4.4
##  4        5  
##  5        4.4
##  6        3.8
##  7        3.4
##  8        4.6
##  9        5  
## 10        4.4
## # ℹ 2,641 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.2 For women only (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



library(dplyr)

# Subset df_2 to include only female employees
df_2_female <- df_2 %>%
  filter(gender == "Female")

# Calculate the engagement factor for female employees
df_2_female$engagement_factor <- rowMeans(df_2_female[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)


numeric_df_2 <- Filter(is.numeric, df_2_female)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Female", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 821 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      3.67          4           3.8       3.4       3.25
##  2      5             4.67        4.4       4.6       4.5 
##  3      4.33          4.33        4.8       4.6       5   
##  4      4.33          4.33        5         4.4       4.25
##  5      4.33          4           3.8       4.6       4.75
##  6      4.33          4           4.2       3.4       4   
##  7      5             5           5         4.6       5   
##  8      2             3.33        2.6       2.8       4   
##  9      4.67          4.33        3.6       5         4.25
## 10      4             4           4.4       4.4       4   
## # ℹ 811 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 821 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      3.67          4          3.4       3.25
##  2      5             4.67       4.6       4.5 
##  3      4.33          4.33       4.6       5   
##  4      4.33          4.33       4.4       4.25
##  5      4.33          4          4.6       4.75
##  6      4.33          4          3.4       4   
##  7      5             5          4.6       5   
##  8      2             3.33       2.8       4   
##  9      4.67          4.33       5         4.25
## 10      4             4          4.4       4   
## # ℹ 811 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 821 × 1
##    Engagement
##         <dbl>
##  1        3.8
##  2        4.4
##  3        4.8
##  4        5  
##  5        3.8
##  6        4.2
##  7        5  
##  8        2.6
##  9        3.6
## 10        4.4
## # ℹ 811 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.3 For men only (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



library(dplyr)

# Subset df_2 to include only male employees
df_2_male <- df_2 %>%
  filter(gender == "Male")

# Calculate the engagement factor for female employees
df_2_male$engagement_factor <- rowMeans(df_2_male[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_male)
numeric_df_2 <- Filter(is.numeric, df_2_male)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Male", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 1,831 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          3           3.4       4         5   
##  7      4.67          4.67        4.6       4.6       5   
##  8      4.67          4.33        5         4.8       5   
##  9      4.67          5           4         5         5   
## 10      4             3.67        4         3.6       4   
## # ℹ 1,821 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,831 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4             4          3.8       4.75
##  3      4.67          4.67       5         5   
##  4      5             5          4.4       5   
##  5      4.67          5          4.8       5   
##  6      3.67          3          4         5   
##  7      4.67          4.67       4.6       5   
##  8      4.67          4.33       4.8       5   
##  9      4.67          5          5         5   
## 10      4             3.67       3.6       4   
## # ℹ 1,821 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,831 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        3.6
##  3        4.4
##  4        5  
##  5        4.4
##  6        3.4
##  7        4.6
##  8        5  
##  9        4  
## 10        4  
## # ℹ 1,821 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.4 For Indian employees (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



library(dplyr)

# Subset df_2 to include only Indian employees
df_2_india <- df_2 %>%
  filter(country == "India")

# Calculate the engagement factor for female employees
df_2_india$engagement_factor <- rowMeans(df_2_india[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_india)
numeric_df_2 <- Filter(is.numeric, df_2_india)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "India", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 288 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          5           4         5         5   
##  2      4             4           4.2       5         5   
##  3      5             5           4.6       5         5   
##  4      4.67          4.33        3.6       5         4.25
##  5      5             5           5         5         5   
##  6      3.33          4           4.8       3.6       4   
##  7      4.67          5           4.2       5         5   
##  8      3             3.67        4.6       4.2       3.5 
##  9      4             4           4.8       4         4   
## 10      4.67          5           5         5         5   
## # ℹ 278 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 288 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          5          5         5   
##  2      4             4          5         5   
##  3      5             5          5         5   
##  4      4.67          4.33       5         4.25
##  5      5             5          5         5   
##  6      3.33          4          3.6       4   
##  7      4.67          5          5         5   
##  8      3             3.67       4.2       3.5 
##  9      4             4          4         4   
## 10      4.67          5          5         5   
## # ℹ 278 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 288 × 1
##    Engagement
##         <dbl>
##  1        4  
##  2        4.2
##  3        4.6
##  4        3.6
##  5        5  
##  6        4.8
##  7        4.2
##  8        4.6
##  9        4.8
## 10        5  
## # ℹ 278 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.5 For German employees (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



library(dplyr)

# Subset df_2 to include only Indian employees
df_2_germany <- df_2 %>%
  filter(country == "Germany")

# Calculate the engagement factor for german employees
df_2_germany$engagement_factor <- rowMeans(df_2_germany[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_germany)
numeric_df_2 <- Filter(is.numeric, df_2_germany)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "Germany", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 48 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3.33        3.8       4         4   
##  2      4             5           4.4       3.8       5   
##  3      3.67          3           3.6       2.8       3.5 
##  4      3.67          3.33        3.4       3.6       4   
##  5      4.33          4           4.4       5         4.75
##  6      4.33          2.67        3.2       3.4       2.75
##  7      4             3.67        4.4       4.8       4.25
##  8      4             3.33        4         5         5   
##  9      5             4.33        4.6       5         5   
## 10      3.67          3           3         5         4   
## # ℹ 38 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 48 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3.33       4         4   
##  2      4             5          3.8       5   
##  3      3.67          3          2.8       3.5 
##  4      3.67          3.33       3.6       4   
##  5      4.33          4          5         4.75
##  6      4.33          2.67       3.4       2.75
##  7      4             3.67       4.8       4.25
##  8      4             3.33       5         5   
##  9      5             4.33       5         5   
## 10      3.67          3          5         4   
## # ℹ 38 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 48 × 1
##    Engagement
##         <dbl>
##  1        3.8
##  2        4.4
##  3        3.6
##  4        3.4
##  5        4.4
##  6        3.2
##  7        4.4
##  8        4  
##  9        4.6
## 10        3  
## # ℹ 38 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.6 For employees who are high on employee_engagement (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



This analysis included survey responses onlyfrom employees whose engagement factor composite score is equal to or greater than 4 only.

library(dplyr)

# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

hist(df_2$engagement_factor)

high_engagement <- df_2 %>%
  filter(engagement_factor >= 4)

numeric_df_2 <- Filter(is.numeric, high_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for highly engaging employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Pearson's r | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_scores_high <- composite_scores %>%
  filter(Engagement >= 4)

composite_factor_scores <- subset(composite_scores_high, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,819 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4.67          4.67       5         5   
##  3      5             5          4.4       5   
##  4      4.67          5          4.8       5   
##  5      4.67          4.67       4.6       5   
##  6      4.67          4.33       4.8       5   
##  7      5             4.67       4.6       4.5 
##  8      4.67          5          5         5   
##  9      4             3.67       3.6       4   
## 10    NaN           NaN        NaN       NaN   
## # ℹ 1,809 more rows
engagement_factor_scores <- subset(composite_scores_high, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,819 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        4.4
##  3        5  
##  4        4.4
##  5        4.6
##  6        5  
##  7        4.4
##  8        4  
##  9        4  
## 10        4  
## # ℹ 1,809 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis" 
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.4.7 For employees who are low on employee_engagement (correlation calculated by conducting kendall’s tau-b & Pearson’s r)



This analysis included survey responses only from those employees whose engagement factor composite score is smaller than or equal to 2. n = 47

library(dplyr)

# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

hist(df_2$engagement_factor)

low_engagement <- df_2 %>%
  filter(engagement_factor <= 2)

numeric_df_2 <- Filter(is.numeric, low_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Pearson's r | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_scores_low <- composite_scores %>%
  filter(Engagement <= 2)

composite_factor_scores <- subset(composite_scores_low, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 47 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      3.33          4          4.2       1.75
##  2      2             1          2.2       3.25
##  3      2.33          3.33       2         2   
##  4      3             4          3.6       3   
##  5      1.67          4          2.8       2.5 
##  6      2.33          2          3         1   
##  7      2.33          3          3.8       3   
##  8      1.67          3.67       2.6       4.5 
##  9      3             2.67       3         2.5 
## 10      1.67          2.67       2.8       2.75
## # ℹ 37 more rows
engagement_factor_scores <- subset(composite_scores_low, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 47 × 1
##    Engagement
##         <dbl>
##  1        1.8
##  2        1.2
##  3        1.8
##  4        2  
##  5        2  
##  6        1.4
##  7        2  
##  8        1.8
##  9        2  
## 10        1.8
## # ℹ 37 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for analysis" 
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )



3.5 Calculate Engagement & Leadership Factor Scores for All Countries.



3.5.1 Creating a graph comparing factor favorable scores for engagement & leadership across countries



#Create an empty data frame to save the consolidated favorable_scores
consolidated_favorable_scores <- data.frame(
  factor_abbreviation = character(),
  factor_favorable_score = numeric(),
  country = character(),
  stringsAsFactors = FALSE
)

#list of countries
countries <- c("Australia", "Canada", "China", "Denmark", "France", "Germany", "India", "United Kingdom", "United States")  

#Loop through each country and calculate factor favorable score
for (country in countries) {
  #Filter data for the specific country
  country_df <- df_2[df_2$country == country & !is.na(df_2$country), ]
  
  #Calculate favorability_score using a custom function
  favorability_score <- function_favorability(country_df[, c(8:12, 18:21)])
  colnames(favorability_score)[1] <- "question_number"
  
  factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number))
  
  #Create a data frame for this country's factor_favorable_score (so a dataframe within a loop)
  factor_favorable_score <- data.frame(
    factor_abbreviation = character(),
    factor_favorable_score = numeric(),
    country = character(),
    stringsAsFactors = FALSE
  )
  
  #Calculate the average favorable score for each factor
  for (factor_abbr in factor_abbreviations) {
    #Subset favorable scores for the factor
    factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    
    #Calculate the average favorable score
    avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)
    
    #Append the calculated score to the country's data frame
    factor_favorable_score <- rbind(
      factor_favorable_score,
      data.frame(
        factor_abbreviation = factor_abbr,
        factor_favorable_score = avg_favorable_score,
        country = country  # Add the country label
      )
    )
  }
  
  #Consolidate this country's data frame into the main consolidated data frame
  consolidated_favorable_scores <- rbind(
    consolidated_favorable_scores,
    factor_favorable_score
  )
}

print(consolidated_favorable_scores)
##    factor_abbreviation factor_favorable_score        country
## 1                  eng              0.8277228      Australia
## 2                  lea              0.9257426      Australia
## 3                  eng              0.8380952         Canada
## 4                  lea              0.8625932         Canada
## 5                  eng              0.8066667          China
## 6                  lea              0.8500000          China
## 7                  eng              0.8083333        Denmark
## 8                  lea              0.8618659        Denmark
## 9                  eng              0.8338235         France
## 10                 lea              0.9301471         France
## 11                 eng              0.6962766        Germany
## 12                 lea              0.8645833        Germany
## 13                 eng              0.8440634          India
## 14                 lea              0.9308116          India
## 15                 eng              0.8099251 United Kingdom
## 16                 lea              0.8995405 United Kingdom
## 17                 eng              0.7786955  United States
## 18                 lea              0.8336043  United States
#Sort the data frame by factor_favorable_score 
consolidated_favorable_scores <- consolidated_favorable_scores %>%
  arrange(desc(factor_favorable_score))

#Reorder the factor levels based on the sorted data frame
consolidated_favorable_scores$factor_abbreviation <- factor(
  consolidated_favorable_scores$factor_abbreviation,
  levels = unique(consolidated_favorable_scores$factor_abbreviation)  
)
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(scales)

# Filter the data into two subsets: one for 'lea' (leadership factor) and one for 'eng' (engagement factor)
leadership_scores <- consolidated_favorable_scores %>%
  filter(factor_abbreviation == "lea")

engagement_scores <- consolidated_favorable_scores %>%
  filter(factor_abbreviation == "eng")

# Reorder the factor levels based on the sorted data frame
leadership_scores$country <- factor(
  leadership_scores$country,
  levels = unique(leadership_scores$country)
)

engagement_scores$country <- factor(
  engagement_scores$country,
  levels = unique(engagement_scores$country)
)

# Create ggplot for leadership scores
ggplot(leadership_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5  # Position the text above the bars
  ) +
  labs(
    x = "Country",
    y = "Leadership Favorability Score (%)",
    title = "Leadership Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
  )

# Create ggplot for engagement scores
ggplot(engagement_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
    position = position_dodge(width = 0.9),
      vjust = -0.5,
    size = 3
  ) +
  labs(
    x = "Country",
    y = "Engagement Favorability Score (%)",
    title = "Engagement Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
  )

# Create a dataframe to save the consolidated factor scores
consolidated_favorable_scores <- data.frame(
  factor_abbreviation = character(),
  factor_favorable_score = numeric(),
  country = character(),
  stringsAsFactors = FALSE
)

# List of countries
countries <- c("Australia", "Canada", "China", "Denmark", "France", "Germany", "India", "United Kingdom", "United States")

# Loop through each country and calculate factor favorable score
for (country in countries) {
  # Filter data for the specific country
  country_df <- df_2[df_2$country == country & !is.na(df_2$country), ]

  # Calculate favorability_score using a custom function for leadership factor
  favorability_score <- function_favorability(country_df[, c(18:21)])  # 'lea' questions
  colnames(favorability_score)[1] <- "question_number"

  # Create a data frame for this country's factor_favorable_score
  factor_favorable_score <- data.frame(
    factor_abbreviation = character(),
    factor_favorable_score = numeric(),
    country = character(),
    stringsAsFactors = FALSE
  )

  # Calculate the average favorable score for each factor
  for (factor_abbr in unique(sub("_.*", "", favorability_score$question_number))) {
    # Subset favorable scores for the factor
    factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    
    # Calculate the average favorable score
    avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)
    
    # Append the calculated score to the data frame
    factor_favorable_score <- rbind(
      factor_favorable_score,
      data.frame(
        factor_abbreviation = factor_abbr,
        factor_favorable_score = avg_favorable_score,
        country = country  # Add the country label
      )
    )
  }

  # Consolidate this country's data frame into the main consolidated data frame
  consolidated_favorable_scores <- rbind(
    consolidated_favorable_scores,
    factor_favorable_score
  )
}

# Now calculate the company's average leadership score based on all employees
all_employees_df <- df_2  # Full dataset
company_favorability_score <- function_favorability(all_employees_df[, c(18:21)])  # 'lea' questions
company_avg_leadership_score <- mean(company_favorability_score$favorability_score, na.rm = TRUE)  # Average leadership factor

# Create a row for the company's average score
company_average_row <- data.frame(
  factor_abbreviation = "lea",
  factor_favorable_score = company_avg_leadership_score,
  country = "Company Average"  # Label for the overall company
)

# Add the company's average to the consolidated data frame
consolidated_favorable_scores <- rbind(company_average_row, consolidated_favorable_scores)

# Define the desired order for the countries
desired_order <- c("Company Average", "India", "France", "Australia", "United Kingdom", "Germany", "Canada", "Denmark", "China", "United States")

# Reorder the factor levels for proper display
consolidated_favorable_scores$country <- factor(
  consolidated_favorable_scores$country,
  levels = desired_order  # 'Company Average' first, then other countries in the desired order
)

# Create a ggplot for leadership scores including the company average
ggplot(consolidated_favorable_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    x = "Country",
    y = "Leadership Favorability Score (%)",
    title = "Leadership Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)  # Rotate x-axis labels for readability
  )

library(ggplot2)
library(dplyr)
library(scales)

# Create a dataframe to save the consolidated factor scores
consolidated_favorable_scores <- data.frame(
  factor_abbreviation = character(),
  factor_favorable_score = numeric(),
  country = character(),
  stringsAsFactors = FALSE
)

# List of countries
countries <- c("India", "Canada", "France", "Australia", "United Kingdom", "Denmark", "China", "United States", "Germany")

# Loop through each country and calculate factor favorable score
for (country in countries) {
  # Filter data for the specific country
  country_df <- df_2[df_2$country == country & !is.na(df_2$country), ]

  # Calculate favorability_score using a custom function for 'eng' questions (Engagement)
  favorability_score <- function_favorability(country_df[, c(8:12)])
  colnames(favorability_score)[1] <- "question_number"

  # Create a data frame for this country's factor_favorable_score
  factor_favorable_score <- data.frame(
    factor_abbreviation = "eng",
    factor_favorable_score = mean(favorability_score$favorability_score, na.rm = TRUE),
    country = country
  )

  # Consolidate this country's data frame into the main consolidated data frame
  consolidated_favorable_scores <- rbind(consolidated_favorable_scores, factor_favorable_score)
}


# Now calculate the company's average leadership score based on all employees
all_employees_df <- df_2  # Full dataset
company_favorability_score <- function_favorability(all_employees_df[, c(8:12)])  # 'eng' questions
company_avg_engagement_score <- mean(company_favorability_score$favorability_score, na.rm = TRUE)  # Average engagement factor

# Create a row for the company's average score
company_average_row <- data.frame(
  factor_abbreviation = "eng",
  factor_favorable_score = company_avg_engagement_score,
  country = "Company Average"  # Label for the overall company
)


# Add the company's average engagement score to the consolidated data frame
company_average_row <- data.frame(
  factor_abbreviation = "eng",
  factor_favorable_score = company_avg_engagement_score,
  country = "Company Average"
)
consolidated_favorable_scores <- rbind(company_average_row, consolidated_favorable_scores)

# Define the desired order for the countries
desired_order <- c("Company Average", "India", "Canada", "France", "Australia", "United Kingdom", "Denmark", "China", "United States", "Germany")

# Reorder the countries for proper display
consolidated_favorable_scores$country <- factor(
  consolidated_favorable_scores$country,
  levels = desired_order
)

# Create ggplot for engagement scores
ggplot(consolidated_favorable_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),  
    position = position_dodge(width = 0.9),  
    vjust = -0.5,  
    size = 3  
  ) +
  labs(
    x = "Country",
    y = "Engagement Favorability Score (%)",
    title = "Engagement Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)  
  )



3.5.2 Australia’s



n = 101

df_2[df_2$country == "Australia" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Australia", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9207921
## 2           eng_2          0.9306931
## 3           eng_3          0.8514851
## 4           eng_4          0.6930693
## 5           eng_5          0.7425743
## 6           lea_1          0.8613861
## 7           lea_2          0.9207921
## 8           lea_3          0.9603960
## 9           lea_4          0.9603960
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Australia"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score   country
## 1                 eng              0.8277228 Australia
## 2                 lea              0.9257426 Australia
favorability_score$Country <- "Australia" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score   Country
## 10             eng          0.8277228 Australia
## 1            eng_1          0.9207921 Australia
## 2            eng_2          0.9306931 Australia
## 3            eng_3          0.8514851 Australia
## 4            eng_4          0.6930693 Australia
## 5            eng_5          0.7425743 Australia
## 11             lea          0.9257426 Australia
## 6            lea_1          0.8613861 Australia
## 7            lea_2          0.9207921 Australia
## 8            lea_3          0.9603960 Australia
## 9            lea_4          0.9603960 Australia
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score   Country
## 10             eng          0.8277228 Australia
## 1            eng_1          0.9207921 Australia
## 2            eng_2          0.9306931 Australia
## 3            eng_3          0.8514851 Australia
## 4            eng_4          0.6930693 Australia
## 5            eng_5          0.7425743 Australia
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Aussie employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Aussie Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1)) 



3.5.3 Canada’s



n = 84

df_2[df_2$country == "Canada" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Canada", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9285714
## 2           eng_2          0.8333333
## 3           eng_3          0.8690476
## 4           eng_4          0.7619048
## 5           eng_5          0.7976190
## 6           lea_1          0.8452381
## 7           lea_2          0.8333333
## 8           lea_3          0.9404762
## 9           lea_4          0.8313253
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Canada"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8380952  Canada
## 2                 lea              0.8625932  Canada
favorability_score$Country <- "Canada" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8380952  Canada
## 1            eng_1          0.9285714  Canada
## 2            eng_2          0.8333333  Canada
## 3            eng_3          0.8690476  Canada
## 4            eng_4          0.7619048  Canada
## 5            eng_5          0.7976190  Canada
## 11             lea          0.8625932  Canada
## 6            lea_1          0.8452381  Canada
## 7            lea_2          0.8333333  Canada
## 8            lea_3          0.9404762  Canada
## 9            lea_4          0.8313253  Canada
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8380952  Canada
## 1            eng_1          0.9285714  Canada
## 2            eng_2          0.8333333  Canada
## 3            eng_3          0.8690476  Canada
## 4            eng_4          0.7619048  Canada
## 5            eng_5          0.7976190  Canada
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Canadian Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))



3.5.4 China’s



n = 60

df_2[df_2$country == "China" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "China", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.8666667
## 2           eng_2          0.7833333
## 3           eng_3          0.7833333
## 4           eng_4          0.7833333
## 5           eng_5          0.8166667
## 6           lea_1          0.8333333
## 7           lea_2          0.8166667
## 8           lea_3          0.8833333
## 9           lea_4          0.8666667
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "China"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8066667   China
## 2                 lea              0.8500000   China
favorability_score$Country <- "China" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8066667   China
## 1            eng_1          0.8666667   China
## 2            eng_2          0.7833333   China
## 3            eng_3          0.7833333   China
## 4            eng_4          0.7833333   China
## 5            eng_5          0.8166667   China
## 11             lea          0.8500000   China
## 6            lea_1          0.8333333   China
## 7            lea_2          0.8166667   China
## 8            lea_3          0.8833333   China
## 9            lea_4          0.8666667   China
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8066667   China
## 1            eng_1          0.8666667   China
## 2            eng_2          0.7833333   China
## 3            eng_3          0.7833333   China
## 4            eng_4          0.7833333   China
## 5            eng_5          0.8166667   China
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Chinese employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Chinese Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))



3.5.5 Denmark’s



n = 24

df_2[df_2$country == "Denmark" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Denmark", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9583333
## 2           eng_2          0.7916667
## 3           eng_3          0.8333333
## 4           eng_4          0.7083333
## 5           eng_5          0.7500000
## 6           lea_1          0.8333333
## 7           lea_2          0.8750000
## 8           lea_3          0.8695652
## 9           lea_4          0.8695652
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Denmark"  # Add the country label
    )
  )
}



print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8083333 Denmark
## 2                 lea              0.8618659 Denmark
favorability_score$Country <- "Denmark" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8083333 Denmark
## 1            eng_1          0.9583333 Denmark
## 2            eng_2          0.7916667 Denmark
## 3            eng_3          0.8333333 Denmark
## 4            eng_4          0.7083333 Denmark
## 5            eng_5          0.7500000 Denmark
## 11             lea          0.8618659 Denmark
## 6            lea_1          0.8333333 Denmark
## 7            lea_2          0.8750000 Denmark
## 8            lea_3          0.8695652 Denmark
## 9            lea_4          0.8695652 Denmark
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8083333 Denmark
## 1            eng_1          0.9583333 Denmark
## 2            eng_2          0.7916667 Denmark
## 3            eng_3          0.8333333 Denmark
## 4            eng_4          0.7083333 Denmark
## 5            eng_5          0.7500000 Denmark
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Danish Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  



3.5.6 France’s



n = 136

df_2[df_2$country == "France" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "France", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9338235
## 2           eng_2          0.8602941
## 3           eng_3          0.8750000
## 4           eng_4          0.7279412
## 5           eng_5          0.7720588
## 6           lea_1          0.9264706
## 7           lea_2          0.9044118
## 8           lea_3          0.9411765
## 9           lea_4          0.9485294
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "France"  # Add the country label
    )
  )
}



print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8338235  France
## 2                 lea              0.9301471  France
favorability_score$Country <- "France" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8338235  France
## 1            eng_1          0.9338235  France
## 2            eng_2          0.8602941  France
## 3            eng_3          0.8750000  France
## 4            eng_4          0.7279412  France
## 5            eng_5          0.7720588  France
## 11             lea          0.9301471  France
## 6            lea_1          0.9264706  France
## 7            lea_2          0.9044118  France
## 8            lea_3          0.9411765  France
## 9            lea_4          0.9485294  France
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8338235  France
## 1            eng_1          0.9338235  France
## 2            eng_2          0.8602941  France
## 3            eng_3          0.8750000  France
## 4            eng_4          0.7279412  France
## 5            eng_5          0.7720588  France
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for French employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for French Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  



3.5.7 Germany’s



n = 48

df_2[df_2$country == "Germany" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Germany", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.8333333
## 2           eng_2          0.8333333
## 3           eng_3          0.7083333
## 4           eng_4          0.5106383
## 5           eng_5          0.5957447
## 6           lea_1          0.8750000
## 7           lea_2          0.7916667
## 8           lea_3          0.9375000
## 9           lea_4          0.8541667
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Germany"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.6962766 Germany
## 2                 lea              0.8645833 Germany
favorability_score$Country <- "Germany" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.6962766 Germany
## 1            eng_1          0.8333333 Germany
## 2            eng_2          0.8333333 Germany
## 3            eng_3          0.7083333 Germany
## 4            eng_4          0.5106383 Germany
## 5            eng_5          0.5957447 Germany
## 11             lea          0.8645833 Germany
## 6            lea_1          0.8750000 Germany
## 7            lea_2          0.7916667 Germany
## 8            lea_3          0.9375000 Germany
## 9            lea_4          0.8541667 Germany
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.6962766 Germany
## 1            eng_1          0.8333333 Germany
## 2            eng_2          0.8333333 Germany
## 3            eng_3          0.7083333 Germany
## 4            eng_4          0.5106383 Germany
## 5            eng_5          0.5957447 Germany
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for German employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for German Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

#Desired order of question_number in reverse as I am making a horizontal graph
desired_order <- c("eng_5", "eng_4", "eng_3", "eng_2", "eng_1", "eng")

#Set the factor levels in reverse order
engagement_favorable_scores$question_number <- factor(
  engagement_favorable_scores$question_number,
  levels = (desired_order)  # Reverse the desired order
)

#Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_scores$question_number[1], "first", "other")




#Color palette for highlighting
highlight_colors <- c("first" = "#9A36B2",  # Purple for the 'first' category
                      "other" = "#C9A5DD")  # light purple for the 'other' category

#Create a horizontal bar plot with ggplot
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score*100, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) + 
  coord_flip() +  
  labs(
    x = "Factor Items",  
    y = "Favorability Score (%)", 
    title = "Engagement Factor Favorable Scores for Employees in Germany",
    caption = "Favorability score represents the percentage of respondents who agreed or strongly agreed with the question item over all non-missing responses. Factor favorability score is an average of all the items for the factor."  
  ) +
  theme_minimal() + 
  theme(
    axis.text.y = element_text(size = 12),  
    axis.text.x = element_text(size = 12), 
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14),
  plot.caption = element_text(size = 6, hjust = 0)  
  ) +
  geom_text(aes(label = paste0(round(favorability_score*100), "%")), 
            hjust = -0.3,  
            size = 4) +  
  scale_fill_manual(values = highlight_colors)  



3.5.8 India’s



n = 288

df_2[df_2$country == "India" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "India", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9442509
## 2           eng_2          0.9166667
## 3           eng_3          0.8767606
## 4           eng_4          0.7152778
## 5           eng_5          0.7673611
## 6           lea_1          0.8982456
## 7           lea_2          0.9125874
## 8           lea_3          0.9581882
## 9           lea_4          0.9542254
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "India"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8440634   India
## 2                 lea              0.9308116   India
favorability_score$Country <- "India" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8440634   India
## 1            eng_1          0.9442509   India
## 2            eng_2          0.9166667   India
## 3            eng_3          0.8767606   India
## 4            eng_4          0.7152778   India
## 5            eng_5          0.7673611   India
## 11             lea          0.9308116   India
## 6            lea_1          0.8982456   India
## 7            lea_2          0.9125874   India
## 8            lea_3          0.9581882   India
## 9            lea_4          0.9542254   India
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8440634   India
## 1            eng_1          0.9442509   India
## 2            eng_2          0.9166667   India
## 3            eng_3          0.8767606   India
## 4            eng_4          0.7152778   India
## 5            eng_5          0.7673611   India
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Indian employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Indian Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  



3.5.9 United Kingdom’s



n = 359

df_2[df_2$country == "United Kingdom" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United Kingdom", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9385475
## 2           eng_2          0.8770950
## 3           eng_3          0.8328691
## 4           eng_4          0.6796657
## 5           eng_5          0.7214485
## 6           lea_1          0.8743017
## 7           lea_2          0.8659218
## 8           lea_3          0.9220056
## 9           lea_4          0.9359331
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "United Kingdom"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score        country
## 1                 eng              0.8099251 United Kingdom
## 2                 lea              0.8995405 United Kingdom
favorability_score$Country <- "United Kingdom" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Country
## 10             eng          0.8099251 United Kingdom
## 1            eng_1          0.9385475 United Kingdom
## 2            eng_2          0.8770950 United Kingdom
## 3            eng_3          0.8328691 United Kingdom
## 4            eng_4          0.6796657 United Kingdom
## 5            eng_5          0.7214485 United Kingdom
## 11             lea          0.8995405 United Kingdom
## 6            lea_1          0.8743017 United Kingdom
## 7            lea_2          0.8659218 United Kingdom
## 8            lea_3          0.9220056 United Kingdom
## 9            lea_4          0.9359331 United Kingdom
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score        Country
## 10             eng          0.8099251 United Kingdom
## 1            eng_1          0.9385475 United Kingdom
## 2            eng_2          0.8770950 United Kingdom
## 3            eng_3          0.8328691 United Kingdom
## 4            eng_4          0.6796657 United Kingdom
## 5            eng_5          0.7214485 United Kingdom
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.K. Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  



3.5.10 United States’



n = 1551

df_2[df_2$country == "United States" & !is.na(df_2$country), ]
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United States", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9153200
## 2           eng_2          0.8312864
## 3           eng_3          0.7692806
## 4           eng_4          0.6735751
## 5           eng_5          0.7040155
## 6           lea_1          0.8000000
## 7           lea_2          0.8273616
## 8           lea_3          0.8654971
## 9           lea_4          0.8415584
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "United States"  # Add the country label
    )
  )
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score       country
## 1                 eng              0.7786955 United States
## 2                 lea              0.8336043 United States
favorability_score$Country <- "United States" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score       Country
## 10             eng          0.7786955 United States
## 1            eng_1          0.9153200 United States
## 2            eng_2          0.8312864 United States
## 3            eng_3          0.7692806 United States
## 4            eng_4          0.6735751 United States
## 5            eng_5          0.7040155 United States
## 11             lea          0.8336043 United States
## 6            lea_1          0.8000000 United States
## 7            lea_2          0.8273616 United States
## 8            lea_3          0.8654971 United States
## 9            lea_4          0.8415584 United States
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score       Country
## 10             eng          0.7786955 United States
## 1            eng_1          0.9153200 United States
## 2            eng_2          0.8312864 United States
## 3            eng_3          0.7692806 United States
## 4            eng_4          0.6735751 United States
## 5            eng_5          0.7040155 United States
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.S. Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  



3.6 Demographic Composition of the Company



3.6.1 By country



library(ggplot2)
library(dplyr)

country_df <- as.data.frame(table(df_2$country))
colnames(country_df) <- c("Country", "Count")


##bar graph
ggplot(country_df, aes(x = Country, y = Count, fill = Country)) +  
  geom_bar(stat = "identity") +  
  labs(
    x = "Country",  
    y = "Employee Count",  
    title = paste("Employee Distribution Across Countries"),
  caption = "Total: 2651 employees"  
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(hjust = 1, face = "italic")  
  ) +
  geom_text(aes(label = paste(Count)), vjust = -0.5, size = 3.5)  

# Calculate the proportion of employees for each country
country_df <- country_df %>%
  mutate(Proportion = Count / sum(Count))  

ggplot(country_df, aes(x = "", y = Proportion, fill = Country)) +
  geom_bar(stat = "identity", width = 1) +  
  coord_polar(theta = "y") + 
  labs(
    x = NULL, 
    y = NULL,  
    title = "Employee Distribution by Country (Percentage)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),  
    panel.grid = element_blank() 
  ) +
  geom_text(aes(label = paste(Country, scales::percent(Proportion, accuracy = 0.1))), 
            position = position_stack(vjust = 0.5),  # Center text on each slice
            size = 3)  

# barchart in percentage

ggplot(country_df, aes(x = Country, y = Proportion*100, fill = Country)) +
  geom_bar(stat = "identity") +  
  labs(
    x = "Country", 
    y = "Proportion (%)",  
    title = "Employee Distribution by Country (Percentage)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)  
  ) +
  geom_text(aes(label = paste0(round(Proportion*100, 1), "%")),  
            vjust = -0.5, 
            size = 3.5)  



3.6.2 By gender



#by gender

gender_df <- as.data.frame(table(df_2$gender, useNA = "always"))
colnames(gender_df) <- c("Gender", "Count")
gender_df <- gender_df %>%
  mutate(
    Gender = case_when(
      Gender == "Female" ~ "Women",  
      Gender == "Male" ~ "Men",  
      is.na(Gender) ~ "Unknown",  
      TRUE ~ Gender  # Keep other values unchanged
    )
  )
gender_df
gender_df$Tenure <- factor(
    gender_df$Gender, 
    levels = c("Men", "Women", "Unknown"))

total_employees <- sum(gender_df$Count)

ggplot(gender_df, aes(x = "", y = Count, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +  
  coord_polar(theta = "y") +  
  labs(
    x = NULL,  
    y = NULL,  
    title = paste("Employee Distribution Across Gender (Total:", total_employees, "employees)")  # Add total employees in the title
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),  
    panel.grid = element_blank()  
  ) +
  geom_text(aes(label = paste(Gender, " : ", Count)),  
            position = position_stack(vjust = 0.5),  
            size = 3.5)

##bar graph
ggplot(gender_df, aes(x = Gender, y = Count, fill = Gender)) +  
  geom_bar(stat = "identity") +  
  labs(
    x = "Gender Group",  
    y = "Employee Count",  
    title = paste("Employee Distribution Across Gender Groups"),
  caption = "Total: 2651 employees"  
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(hjust = 1, face = "italic")  
  ) +
  geom_text(aes(label = paste(Count)), vjust = -0.5, size = 3.5)



3.6.3 By tenure



#by tenure group


tenure_df <- as.data.frame(table(df_2$tenure_group))
colnames(tenure_df) <- c("Tenure", "Count")

tenure_df$Tenure <- factor(
    tenure_df$Tenure, 
    levels = c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years", "10+ years"))

    
tenure_df
total_employees <- sum(tenure_df$Count)

##bar graph
ggplot(tenure_df, aes(x = Tenure, y = Count, fill = Tenure)) +  
  geom_bar(stat = "identity") +  
  labs(
    x = "Tenure Group",  
    y = "Employee Count",  
    title = paste("Employee Distribution Across Tenure Groups"),
  caption = "Total: 2651 employees"  
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(hjust = 1, face = "italic")  
  ) +
  geom_text(aes(label = paste(Count)), vjust = -0.5, size = 3)  



3.6.4 By tenure & gender



#by tenure & gender

library(ggplot2)
library(dplyr)

#Create separate data frames for men and women by tenure group
men_tenure_df <- as.data.frame(table(df_2$tenure_group[df_2$gender == "Male"]))
women_tenure_df <- as.data.frame(table(df_2$tenure_group[df_2$gender == "Female"]))

colnames(men_tenure_df) <- c("Tenure", "Count")
colnames(women_tenure_df) <- c("Tenure", "Count")

#Ensure the tenure groups are in the correct order
tenure_levels <- c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years", "10+ years")

men_tenure_df$Tenure <- factor(men_tenure_df$Tenure, levels = tenure_levels)
women_tenure_df$Tenure <- factor(women_tenure_df$Tenure, levels = tenure_levels)

#Add a gender column to each data frame
men_tenure_df$Gender <- "Men"
women_tenure_df$Gender <- "Women"

#Combine the two data frames
combined_tenure_df <- rbind(men_tenure_df, women_tenure_df)

#Create a bar plot comparing men and women by tenure group
ggplot(combined_tenure_df, aes(x = Tenure, y = Count, fill = Gender)) +  
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  labs(
    x = "Tenure Group",  
    y = "Employee Count",  
    title = "Employee Distribution by Tenure Group and Gender",
    caption = "Total: 2651 employees"  
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(hjust = 1, face = "italic")  
  ) +
  geom_text(aes(label = Count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3)